provide text converters for XLS and PPT
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1626 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
65249df13f
commit
06d4652807
9 changed files with 124 additions and 4 deletions
|
@ -66,3 +66,32 @@ Word Documents
|
|||
89
|
||||
>>> u'lamb' in words
|
||||
True
|
||||
|
||||
PowerPoint Presentations
|
||||
------------------------
|
||||
|
||||
>>> from cybertools.text.ppt import PptTransform
|
||||
>>> transform = PptTransform(None)
|
||||
>>> f = open(os.path.join(testdir, 'mary.ppt'))
|
||||
>>> result = transform(f)
|
||||
>>> print log
|
||||
>>> words = result.split()
|
||||
>>> len(words)
|
||||
102
|
||||
>>> u'lamb' in words
|
||||
True
|
||||
|
||||
Excel Spreadsheets
|
||||
------------------
|
||||
|
||||
>>> from cybertools.text.xls import XlsTransform
|
||||
>>> transform = XlsTransform(None)
|
||||
>>> f = open(os.path.join(testdir, 'mary.xls'))
|
||||
>>> result = transform(f)
|
||||
>>> print log
|
||||
>>> words = result.split()
|
||||
>>> len(words)
|
||||
89
|
||||
>>> u'lamb' in words
|
||||
True
|
||||
|
||||
|
|
|
@ -17,10 +17,9 @@
|
|||
#
|
||||
|
||||
"""
|
||||
Searchable text support for Portable Document Format (PDF) files.
|
||||
Searchable text support for MS Word (.doc) files.
|
||||
|
||||
This uses the pdftotext command from xpdf to perform the extraction.
|
||||
interface definitions for text transformations.
|
||||
This uses the wvware command to perform the extraction.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
Searchable text support for Portable Document Format (PDF) files.
|
||||
|
||||
This uses the pdftotext command from xpdf to perform the extraction.
|
||||
interface definitions for text transformations.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
|
|
48
text/ppt.py
Normal file
48
text/ppt.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Searchable text support for MS PowerPoint (.ppt) files.
|
||||
|
||||
This uses the ppthtml command to perform the extraction.
|
||||
The HTML emitted by ppthtml is converted to plain text with htmlToText.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
import os, sys
|
||||
|
||||
from cybertools.text import base
|
||||
from cybertools.text.html import htmlToText
|
||||
|
||||
|
||||
class PptTransform(base.BaseFileTransform):
    """Text transform for MS PowerPoint (.ppt) files.

    The text is obtained by running the external ``ppthtml`` command on
    the file and converting the HTML it prints to plain text.
    """

    extension = ".ppt"

    def extract(self, directory, filename):
        """Return the text content of the PowerPoint file as unicode.

        ``directory`` is not used by this implementation; ``filename`` is
        inserted (double-quoted) into the ``ppthtml`` command line.

        Returns an empty unicode string when ``checkAvailable`` reports
        a false value for ``ppthtml``.
        """
        if self.checkAvailable('ppthtml', 'ppthtml is not available'):
            # The two command templates differ only in where stderr is
            # redirected: the Windows null device or /dev/null.
            if sys.platform == 'win32':
                command = 'ppthtml "%s" 2> nul:'
            else:
                command = 'ppthtml "%s" 2> /dev/null'
            # NOTE(review): execute() presumably runs the command through
            # a shell and returns its standard output -- confirm in base.
            text = htmlToText(self.execute(command % filename))
            return text.decode('ISO8859-15')
        return u''
|
BIN
text/testfiles/mary.odp
Normal file
BIN
text/testfiles/mary.odp
Normal file
Binary file not shown.
BIN
text/testfiles/mary.ods
Normal file
BIN
text/testfiles/mary.ods
Normal file
Binary file not shown.
BIN
text/testfiles/mary.ppt
Normal file
BIN
text/testfiles/mary.ppt
Normal file
Binary file not shown.
BIN
text/testfiles/mary.xls
Normal file
BIN
text/testfiles/mary.xls
Normal file
Binary file not shown.
45
text/xls.py
Normal file
45
text/xls.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Searchable text support for MS Excel (.xls) files.
|
||||
|
||||
This uses the xls2csv command to perform the extraction.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
import os, sys
|
||||
|
||||
from cybertools.text import base
|
||||
|
||||
|
||||
class XlsTransform(base.BaseFileTransform):
    """Text transform for MS Excel (.xls) files.

    The text is obtained by running the external ``xls2csv`` command on
    the file and decoding what it prints.
    """

    extension = ".xls"

    def extract(self, directory, filename):
        """Return the text content of the Excel file as unicode.

        ``directory`` is not used by this implementation; ``filename`` is
        inserted (double-quoted) into the ``xls2csv`` command line.

        Returns an empty unicode string when ``checkAvailable`` reports
        a false value for ``xls2csv``.
        """
        if self.checkAvailable('xls2csv', 'xls2csv is not available'):
            # The two command templates differ only in where stderr is
            # redirected: the Windows null device or /dev/null.
            if sys.platform == 'win32':
                command = 'xls2csv -d 8859-1 -q 0 "%s" 2> nul:'
            else:
                command = 'xls2csv -d 8859-1 -q 0 "%s" 2> /dev/null'
            # "-d 8859-1" selects the output charset of xls2csv (see its
            # man page), which is why the result is decoded as ISO8859-1.
            # NOTE(review): execute() presumably runs the command through
            # a shell and returns its standard output -- confirm in base.
            return self.execute(command % filename).decode('ISO8859-1')
        return u''
|
Loading…
Add table
Reference in a new issue