diff --git a/text/README.txt b/text/README.txt
new file mode 100644
index 0000000..2f40c73
--- /dev/null
+++ b/text/README.txt
@@ -0,0 +1,20 @@
+=================================================
+Text transformations, e.g. for full-text indexing
+=================================================
+
+  ($Id$)
+
+  >>> import os
+  >>> from cybertools import text
+  >>> directory = os.path.dirname(text.__file__)
+  >>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
+  >>> f = open(fn, 'rb')
+
+  >>> from cybertools.text.pdf import PdfTransform
+  >>> transform = PdfTransform(None)
+  >>> words = transform(f).split()
+  >>> f.close()
+  >>> len(words)
+  89
+  >>> u'lamb' in words
+  True
diff --git a/text/__init__.py b/text/__init__.py
new file mode 100644
index 0000000..38314f3
--- /dev/null
+++ b/text/__init__.py
@@ -0,0 +1,3 @@
+"""
+$Id$
+"""
diff --git a/text/base.py b/text/base.py
new file mode 100644
index 0000000..e384424
--- /dev/null
+++ b/text/base.py
@@ -0,0 +1,126 @@
+#
+#  Copyright (c) 2006 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+"""
+Base classes for text transformations.
+
+Based on code provided by zc.index.
+
+$Id$
+"""
+
+
+__docformat__ = "reStructuredText"
+
+import os, shutil, sys, tempfile
+from zope.interface import implements
+from cybertools.text.interfaces import ITextTransform, IFileTransform
+
+def haveProgram(name):
+    """Return true if the program `name` is available.
+
+    The directories listed in the PATH environment variable are searched
+    for an executable file called `name`; on Windows the suffixes .com,
+    .exe and .bat are appended to `name`.
+    """
+    if sys.platform.lower().startswith("win"):
+        extensions = (".com", ".exe", ".bat")
+    else:
+        extensions = ("",)
+    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
+    for path in execpath:
+        for ext in extensions:
+            fn = os.path.join(path, name + ext)
+            # a file that may not be executed cannot be run, so it does
+            # not make the program available
+            if os.path.isfile(fn) and os.access(fn, os.X_OK):
+                return True
+    return False
+
+
+class BaseTransform(object):
+    """Transform that returns the content of a file unchanged.
+
+    The result of the first call is cached in `self.text` and returned
+    by every later call, whatever argument is passed.
+    """
+
+    implements(ITextTransform)
+
+    def __init__(self, context):
+        self.context = context
+        self.text = None
+
+    def __call__(self, f):
+        """Return the content of `f`.
+
+        `f` is either an open file-like object (anything with a read()
+        method; it is left open for the caller) or the path of a file
+        that is opened and closed here.
+        """
+        if self.text is None:
+            if hasattr(f, 'read'):
+                self.text = f.read()
+            else:
+                fr = open(f, 'r')
+                try:
+                    self.text = fr.read()
+                finally:
+                    fr.close()
+        return self.text
+
+
+class BaseFileTransform(BaseTransform):
+    """Base class for transforms that work on a temporary disk copy of
+    the input, typically by running an external program on it.
+
+    Subclasses set `extension` and implement extract().
+    """
+
+    implements(IFileTransform)
+
+    # suffix of the temporary file, overridden by subclasses (e.g. ".pdf")
+    extension = ""
+
+    def __call__(self, fr):
+        """Copy the open file object `fr` to a temporary file, pass it to
+        extract() and return the (cached) result.
+        """
+        if self.text is None:
+            dirname = tempfile.mkdtemp()
+            try:
+                filename = os.path.join(dirname, "temp" + self.extension)
+                fw = open(filename, "wb")
+                try:
+                    shutil.copyfileobj(fr, fw)
+                finally:
+                    # close the copy even if copying fails, so that no
+                    # file handle is leaked before the directory is removed
+                    fw.close()
+                text = self.extract(dirname, filename)
+            finally:
+                shutil.rmtree(dirname)
+            self.text = text
+        return self.text
+
+    def extract(self, dirname, filename):
+        """Return the text of `filename`, a file in the temporary
+        directory `dirname`, as unicode; to be overridden by subclasses.
+        """
+        raise ValueError('Method extract() has to be implemented by subclass.')
+
diff --git a/text/interfaces.py b/text/interfaces.py
new file mode 100644
index 0000000..d795868
--- /dev/null
+++ b/text/interfaces.py
@@ -0,0 +1,43 @@
+#
+#  Copyright (c) 2006 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+"""
+Interface definitions for text transformations.
+
+$Id$
+"""
+
+from zope.interface import Interface
+
+
+class ITextTransform(Interface):
+
+    def __call__(f):
+        """ Transform the content of file f to plain text and return
+            the result as unicode.
+        """
+
+
+class IFileTransform(ITextTransform):
+    """ A transformation that uses an intermediate disk file.
+    """
+
+    def extract(dirname, filename):
+        """ Extract text contents from the file specified by dirname, filename,
+            using some external program, and return the result as unicode.
+        """
diff --git a/text/pdf.py b/text/pdf.py
new file mode 100644
index 0000000..f687219
--- /dev/null
+++ b/text/pdf.py
@@ -0,0 +1,49 @@
+"""Searchable text support for Portable Document Format (PDF) files.
+
+This uses the pdftotext command from xpdf to perform the extraction.
+
+"""
+__docformat__ = "reStructuredText"
+
+import os, sys
+import subprocess
+
+from cybertools.text import base
+
+
+class PdfTransform(base.BaseFileTransform):
+    """Extract the text of a PDF file by running pdftotext on it."""
+
+    extension = ".pdf"
+
+    def extract(self, directory, filename):
+        """Return the text of the PDF file `filename` as unicode.
+
+        The output of pdftotext is written to a scratch file in
+        `directory`.  If pdftotext is not installed or does not produce
+        any output, a warning is printed and u'' is returned.
+        """
+        if not base.haveProgram("pdftotext"):
+            print 'Warning: pdftotext is not available'
+            return u''
+        txtfile = os.path.join(directory, "words.txt")
+        # Pass the arguments as a list instead of a formatted command
+        # line, so that paths containing blanks are not split up.  On
+        # Windows the command still runs through the shell, as it did
+        # with os.system(), so that a .com or .bat file accepted by
+        # haveProgram() can be started.
+        useShell = sys.platform.lower().startswith("win")
+        st = subprocess.call(
+                ["pdftotext", "-enc", "UTF-8", filename, txtfile],
+                shell=useShell)
+        if not os.path.isfile(txtfile):
+            # pdftotext produced no output (e.g. for a damaged PDF);
+            # without this check that surfaced as an IOError from open()
+            print 'Warning: pdftotext failed with exit status %s' % st
+            return u''
+        f = open(txtfile, "rb")
+        try:
+            data = f.read()
+        finally:
+            f.close()
+        return unicode(data, "utf-8")
diff --git a/text/testfiles/mary.pdf b/text/testfiles/mary.pdf
new file mode 100644
index 0000000..381d054
Binary files /dev/null and b/text/testfiles/mary.pdf differ
diff --git a/text/tests.py b/text/tests.py
new file mode 100755
index 0000000..05dbe4b
--- /dev/null
+++ b/text/tests.py
@@ -0,0 +1,29 @@
+#! /usr/bin/python
+
+"""
+Tests for the 'cybertools.text' package.
+
+$Id$
+"""
+
+import unittest, doctest
+from zope.testing.doctestunit import DocFileSuite
+from cybertools.text import pdf
+
+class Test(unittest.TestCase):
+    "Basic tests for the text package."
+
+    def testBasicStuff(self):
+        pass
+
+
+def test_suite():
+    """Return the unit tests plus the doctests in README.txt."""
+    flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
+    return unittest.TestSuite((
+                unittest.makeSuite(Test),
+                DocFileSuite('README.txt', optionflags=flags),
+                ))
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='test_suite')