diff --git a/text/README.txt b/text/README.txt index 0fafd24..42b2ad2 100644 --- a/text/README.txt +++ b/text/README.txt @@ -66,3 +66,32 @@ Word Documents 89 >>> u'lamb' in words True + +PowerPoint Presentations +------------------------ + + >>> from cybertools.text.ppt import PptTransform + >>> transform = PptTransform(None) + >>> f = open(os.path.join(testdir, 'mary.ppt')) + >>> result = transform(f) + >>> print log + >>> words = result.split() + >>> len(words) + 102 + >>> u'lamb' in words + True + +Excel Spreadsheets +------------------ + + >>> from cybertools.text.xls import XlsTransform + >>> transform = XlsTransform(None) + >>> f = open(os.path.join(testdir, 'mary.xls')) + >>> result = transform(f) + >>> print log + >>> words = result.split() + >>> len(words) + 89 + >>> u'lamb' in words + True + diff --git a/text/doc.py b/text/doc.py index 3850246..8ea895f 100644 --- a/text/doc.py +++ b/text/doc.py @@ -17,10 +17,9 @@ # """ -Searchable text support for Portable Document Format (PDF) files. +Searchable text support for MS Word (.doc) files. -This uses the pdftotext command from xpdf to perform the extraction. -interface definitions for text transformations. +This uses the wvware command to perform the extraction. Based on code provided by zc.index and TextIndexNG3. diff --git a/text/pdf.py b/text/pdf.py index 55b30cd..41ae286 100644 --- a/text/pdf.py +++ b/text/pdf.py @@ -20,7 +20,6 @@ Searchable text support for Portable Document Format (PDF) files. This uses the pdftotext command from xpdf to perform the extraction. -interface definitions for text transformations. Based on code provided by zc.index and TextIndexNG3. 
diff --git a/text/ppt.py b/text/ppt.py new file mode 100644 index 0000000..706a657 --- /dev/null +++ b/text/ppt.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Searchable text support for MS PowerPoint (.ppt) files. + +This uses the ppthtml command to perform the extraction; the HTML it +produces is then converted to plain text. + +Based on code provided by zc.index and TextIndexNG3. 
+ +$Id$ +""" + +import os, sys + +from cybertools.text import base +from cybertools.text.html import htmlToText + + +class PptTransform(base.BaseFileTransform): + + extension = ".ppt" + + def extract(self, directory, filename): + if not self.checkAvailable('ppthtml', 'ppthtml is not available'): + return u'' + if sys.platform == 'win32': + html = self.execute('ppthtml "%s" 2> nul:' % filename) + else: + html = self.execute('ppthtml "%s" 2> /dev/null' % filename) + data = htmlToText(html) + return data.decode('ISO8859-15') diff --git a/text/testfiles/mary.odp b/text/testfiles/mary.odp new file mode 100644 index 0000000..bfb4809 Binary files /dev/null and b/text/testfiles/mary.odp differ diff --git a/text/testfiles/mary.ods b/text/testfiles/mary.ods new file mode 100644 index 0000000..eb5137d Binary files /dev/null and b/text/testfiles/mary.ods differ diff --git a/text/testfiles/mary.ppt b/text/testfiles/mary.ppt new file mode 100644 index 0000000..ee3f01f Binary files /dev/null and b/text/testfiles/mary.ppt differ diff --git a/text/testfiles/mary.xls b/text/testfiles/mary.xls new file mode 100644 index 0000000..769aded Binary files /dev/null and b/text/testfiles/mary.xls differ diff --git a/text/xls.py b/text/xls.py new file mode 100644 index 0000000..31dad60 --- /dev/null +++ b/text/xls.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Searchable text support for MS Excel (.xls) files. + +This uses the xls2csv command to perform the extraction. + +Based on code provided by zc.index and TextIndexNG3. + +$Id$ +""" + +import os, sys + +from cybertools.text import base + + +class XlsTransform(base.BaseFileTransform): + + extension = ".xls" + + def extract(self, directory, filename): + if not self.checkAvailable('xls2csv', 'xls2csv is not available'): + return u'' + if sys.platform == 'win32': + data = self.execute('xls2csv -d 8859-1 -q 0 "%s" 2> nul:' % filename) + else: + data = self.execute('xls2csv -d 8859-1 -q 0 "%s" 2> /dev/null' % filename) + return data.decode('ISO8859-1')