diff --git a/text/README.txt b/text/README.txt index 2f40c73..00cee9e 100644 --- a/text/README.txt +++ b/text/README.txt @@ -1,18 +1,56 @@ ================================================= -Text transformations, e.g. for full-text indexing +Text Transformations, e.g. for Full-text Indexing ================================================= ($Id$) +If a converter program needed is not available we want to put a warning +into Zope's server log; in order to be able to test this we register +a log handler for testing: + + >>> from zope.testing.loggingsupport import InstalledHandler + >>> log = InstalledHandler('zope.server') + +The test files are in a subdirectory of the text package: + >>> import os >>> from cybertools import text - >>> directory = os.path.dirname(text.__file__) - >>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf')) - >>> f = open(fn) + >>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles') + +PDF Files +--------- + +Let's start with a PDF file: >>> from cybertools.text.pdf import PdfTransform >>> transform = PdfTransform(None) - >>> words = transform(f).split() + >>> f = open(os.path.join(testdir, 'mary.pdf')) + +This will be transformed to plain text: + + >>> result = transform(f) + +Let's check the log, should be empty: + + >>> print log + +So what is in the plain text result? + + >>> words = result.split() + >>> len(words) + 89 + >>> u'lamb' in words + True + +Word Documents +-------------- + + >>> from cybertools.text.doc import DocTransform + >>> transform = DocTransform(None) + >>> f = open(os.path.join(testdir, 'mary.doc')) + >>> result = transform(f) + >>> print log + >>> words = result.split() >>> len(words) 89 >>> u'lamb' in words diff --git a/text/base.py b/text/base.py index e384424..74057c3 100644 --- a/text/base.py +++ b/text/base.py @@ -19,32 +19,17 @@ """ Base classes for text transformations. -Based on code provided by zc.index. +Based on code provided by zc.index and TextIndexNG3. 
$Id$ """ -__docformat__ = "reStructuredText" - import os, shutil, sys, tempfile +import logging from zope.interface import implements from cybertools.text.interfaces import ITextTransform, IFileTransform -def haveProgram(name): - """Return true if the program `name` is available.""" - if sys.platform.lower().startswith("win"): - extensions = (".com", ".exe", ".bat") - else: - extensions = ("",) - execpath = os.environ.get("PATH", "").split(os.path.pathsep) - for path in execpath: - for ext in extensions: - fn = os.path.join(path, name + ext) - if os.path.isfile(fn): - return True - return False - class BaseTransform(object): @@ -54,11 +39,9 @@ class BaseTransform(object): self.context = context self.text = None - def __call__(self, f): + def __call__(self, fr): if self.text is None: - fr = open(f, 'r') self.text = fr.read() - fr.close() return self.text @@ -66,22 +49,45 @@ class BaseFileTransform(BaseTransform): implements(IFileTransform) + extension = '.txt' + def __call__(self, fr): if self.text is None: - #fr = f.open("rb") dirname = tempfile.mkdtemp() filename = os.path.join(dirname, "temp" + self.extension) try: fw = open(filename, "wb") shutil.copyfileobj(fr, fw) - #fr.close() fw.close() text = self.extract(dirname, filename) finally: shutil.rmtree(dirname) + #fr.close() self.text = text return self.text def extract(self, dirname, filename): raise ValueError('Method extract() has to be implemented by subclass.') + def execute(self, com): + try: + import win32pipe + result = win32pipe.popen(com).read() + except ImportError: + result = os.popen(com).read() + return result + + def checkAvailable(self, name, logMessage=''): + if sys.platform.lower().startswith("win"): + extensions = (".com", ".exe", ".bat") + else: + extensions = ("",) + execpath = os.environ.get("PATH", "").split(os.path.pathsep) + for path in execpath: + for ext in extensions: + fn = os.path.join(path, name + ext) + if os.path.isfile(fn): + return True + if logMessage: + 
logging.getLogger('zope.server').warn(logMessage) + return False diff --git a/text/config/wvText.xml b/text/config/wvText.xml new file mode 100755 index 0000000..e05d9fb --- /dev/null +++ b/text/config/wvText.xml @@ -0,0 +1,355 @@ +
+ +ABW + + + + + + + + + +
+ + + + +
+ + + + +
+ + +
+ + +type="1" +type="I" +type="i" +type="A" +type="a" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +0 +0 +0 +0 +0 +0 + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/text/doc.py b/text/doc.py new file mode 100644 index 0000000..3850246 --- /dev/null +++ b/text/doc.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Searchable text support for Word (.doc) documents. + +This uses the wvWare command to perform the extraction, passing it +the wvText.xml configuration file from the config subdirectory. + +Based on code provided by zc.index and TextIndexNG3. 
+ +$Id$ +""" + +import os, sys + +from cybertools.text import base + +try: + from Globals import package_home + wvConf = os.path.join(package_home(globals()), 'config', 'wvText.xml') +except ImportError: + wvConf = os.path.join(os.path.dirname(__file__), 'config', 'wvText.xml') + + +class DocTransform(base.BaseFileTransform): + + extension = ".doc" + + def extract(self, directory, filename): + if not self.checkAvailable('wvWare', 'wvWare is not available'): + return u'' + if sys.platform == 'win32': + data = self.execute('wvWare -c utf-8 --nographics -x "%s" "%s" 2> nul:' + % (wvConf, filename)) + else: + data = self.execute('wvWare -c utf-8 --nographics -x "%s" "%s" 2> /dev/null' + % (wvConf, filename)) + return data.decode('UTF-8') diff --git a/text/interfaces.py b/text/interfaces.py index d795868..c74d6ff 100644 --- a/text/interfaces.py +++ b/text/interfaces.py @@ -27,17 +27,32 @@ from zope.interface import Interface class ITextTransform(Interface): - def __call__(f): - """ Transform the content of file f to plain text and return - the result as unicode. + def __call__(fr): + """ Transform the content of file fr (readfile) to plain text and + return the result as unicode. """ class IFileTransform(ITextTransform): - """ A transformation that uses an intermediate disk file. + """ A transformation that is performed by calling some external program + and that typically uses an intermediate disk file. """ def extract(dirname, filename): - """ Extract text contents from the file specified by dirnam, filename, + """ Extract text contents from the file specified by ``filename``, using some external programm, and return the result as unicode. + ``dirname`` is the path to a temporary directory that + usually (but not necessarily) contains the file and may + be used for creating other (temporary) files if needed. + """ + + def execute(command): + """ Execute a system command and return the output of the program + called. 
+ """ + + def checkAvailable(progname, logMessage=''): + """ Check the availability of the program named ``progname``. + Return True if available; if ``logMessage`` is given, put this + as a warning message into the log if the program is not available. """ diff --git a/text/pdf.py b/text/pdf.py index f687219..55b30cd 100644 --- a/text/pdf.py +++ b/text/pdf.py @@ -1,9 +1,31 @@ -"""Searchable text support for Portable Document Format (PDF) files. - -This uses the pdftotext command from xpdf to perform the extraction. +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# """ -__docformat__ = "reStructuredText" +Searchable text support for Portable Document Format (PDF) files. + +This uses the pdftotext command from xpdf to perform the extraction. +interface definitions for text transformations. + +Based on code provided by zc.index and TextIndexNG3. 
+ +$Id$ +""" import os, sys @@ -15,12 +37,7 @@ class PdfTransform(base.BaseFileTransform): extension = ".pdf" def extract(self, directory, filename): - if not base.haveProgram("pdftotext"): - print 'Warning: pdftotext is not available' + if not self.checkAvailable('pdftotext', 'pdftotext is not available'): return u'' - txtfile = os.path.join(directory, "words.txt") - st = os.system("pdftotext -enc UTF-8 %s %s" % (filename, txtfile)) - f = open(txtfile, "rb") - data = f.read() - f.close() - return unicode(data, "utf-8") + data = self.execute('pdftotext -enc UTF-8 "%s" -' % filename) + return data.decode('UTF-8') diff --git a/text/testfiles/mary.doc b/text/testfiles/mary.doc new file mode 100644 index 0000000..847a6de Binary files /dev/null and b/text/testfiles/mary.doc differ diff --git a/text/testfiles/mary.odt b/text/testfiles/mary.odt new file mode 100644 index 0000000..65e48dc Binary files /dev/null and b/text/testfiles/mary.odt differ