provide text converters for XLS and PPT
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1626 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
65249df13f
commit
06d4652807
9 changed files with 124 additions and 4 deletions
|
@ -66,3 +66,32 @@ Word Documents
|
||||||
89
|
89
|
||||||
>>> u'lamb' in words
|
>>> u'lamb' in words
|
||||||
True
|
True
|
||||||
|
|
||||||
|
PowerPoint Presentations
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
>>> from cybertools.text.ppt import PptTransform
|
||||||
|
>>> transform = PptTransform(None)
|
||||||
|
>>> f = open(os.path.join(testdir, 'mary.ppt'))
|
||||||
|
>>> result = transform(f)
|
||||||
|
>>> print log
|
||||||
|
>>> words = result.split()
|
||||||
|
>>> len(words)
|
||||||
|
102
|
||||||
|
>>> u'lamb' in words
|
||||||
|
True
|
||||||
|
|
||||||
|
Excel Spreadsheets
|
||||||
|
------------------
|
||||||
|
|
||||||
|
>>> from cybertools.text.xls import XlsTransform
|
||||||
|
>>> transform = XlsTransform(None)
|
||||||
|
>>> f = open(os.path.join(testdir, 'mary.xls'))
|
||||||
|
>>> result = transform(f)
|
||||||
|
>>> print log
|
||||||
|
>>> words = result.split()
|
||||||
|
>>> len(words)
|
||||||
|
89
|
||||||
|
>>> u'lamb' in words
|
||||||
|
True
|
||||||
|
|
||||||
|
|
|
@ -17,10 +17,9 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Searchable text support for Portable Document Format (PDF) files.
|
Searchable text support for MS Word (.doc) files.
|
||||||
|
|
||||||
This uses the pdftotext command from xpdf to perform the extraction.
|
This uses the wvware command to perform the extraction.
|
||||||
interface definitions for text transformations.
|
|
||||||
|
|
||||||
Based on code provided by zc.index and TextIndexNG3.
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,6 @@
|
||||||
Searchable text support for Portable Document Format (PDF) files.
|
Searchable text support for Portable Document Format (PDF) files.
|
||||||
|
|
||||||
This uses the pdftotext command from xpdf to perform the extraction.
|
This uses the pdftotext command from xpdf to perform the extraction.
|
||||||
interface definitions for text transformations.
|
|
||||||
|
|
||||||
Based on code provided by zc.index and TextIndexNG3.
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
|
48
text/ppt.py
Normal file
48
text/ppt.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Searchable text support for MS PowerPoint (.ppt) files.
|
||||||
|
|
||||||
|
This uses the ppthtml command to perform the extraction.
|
||||||
|
interface definitions for text transformations.
|
||||||
|
|
||||||
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys
|
||||||
|
|
||||||
|
from cybertools.text import base
|
||||||
|
from cybertools.text.html import htmlToText
|
||||||
|
|
||||||
|
|
||||||
|
class PptTransform(base.BaseFileTransform):
    """Text transform for MS PowerPoint (.ppt) files.

    Extraction is delegated to the external ``ppthtml`` command.
    """

    extension = ".ppt"

    def extract(self, directory, filename):
        """Return the text content of the file ``filename`` as unicode.

        The output of the ``ppthtml`` command is passed through
        ``htmlToText`` and then decoded as ISO8859-15.  An empty unicode
        string is returned when ``self.checkAvailable`` yields a false
        value for ``ppthtml``.  ``directory`` is not used by this method.
        """
        if not self.checkAvailable('ppthtml', 'ppthtml is not available'):
            return u''
        # The converter's stderr is discarded; only the name of the
        # null device differs between platforms.
        if sys.platform == 'win32':
            errorSink = 'nul:'
        else:
            errorSink = '/dev/null'
        command = 'ppthtml "%s" 2> %s' % (filename, errorSink)
        text = htmlToText(self.execute(command))
        return text.decode('ISO8859-15')
|
BIN
text/testfiles/mary.odp
Normal file
BIN
text/testfiles/mary.odp
Normal file
Binary file not shown.
BIN
text/testfiles/mary.ods
Normal file
BIN
text/testfiles/mary.ods
Normal file
Binary file not shown.
BIN
text/testfiles/mary.ppt
Normal file
BIN
text/testfiles/mary.ppt
Normal file
Binary file not shown.
BIN
text/testfiles/mary.xls
Normal file
BIN
text/testfiles/mary.xls
Normal file
Binary file not shown.
45
text/xls.py
Normal file
45
text/xls.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Searchable text support for MS Excel (.xls) files.
|
||||||
|
|
||||||
|
This uses the xls2csv command to perform the extraction.
|
||||||
|
|
||||||
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys
|
||||||
|
|
||||||
|
from cybertools.text import base
|
||||||
|
|
||||||
|
|
||||||
|
class XlsTransform(base.BaseFileTransform):
    """Text transform for MS Excel (.xls) files.

    Extraction is delegated to the external ``xls2csv`` command.
    """

    extension = ".xls"

    def extract(self, directory, filename):
        """Return the text content of the file ``filename`` as unicode.

        ``xls2csv`` is invoked with ``-d 8859-1 -q 0`` and its output is
        decoded as ISO8859-1.  An empty unicode string is returned when
        ``self.checkAvailable`` yields a false value for ``xls2csv``.
        ``directory`` is not used by this method.
        """
        if not self.checkAvailable('xls2csv', 'xls2csv is not available'):
            return u''
        # The converter's stderr is discarded; only the name of the
        # null device differs between platforms.
        if sys.platform == 'win32':
            errorSink = 'nul:'
        else:
            errorSink = '/dev/null'
        command = 'xls2csv -d 8859-1 -q 0 "%s" 2> %s' % (filename, errorSink)
        return self.execute(command).decode('ISO8859-1')
|
Loading…
Add table
Reference in a new issue