added package cybertools.text

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1383 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2006-10-04 08:53:58 +00:00
parent f527de4f21
commit f2ec195a55
7 changed files with 206 additions and 0 deletions

19
text/README.txt Normal file
View file

@ -0,0 +1,19 @@
=================================================
Text transformations, e.g. for full-text indexing
=================================================
($Id$)

The sample document ``mary.pdf`` is located in the ``testfiles``
subdirectory of the package. It is opened in binary mode because a PDF
file is not plain text:

>>> import os
>>> from cybertools import text
>>> directory = os.path.dirname(text.__file__)
>>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
>>> f = open(fn, 'rb')

The PDF transformation is created with a context object (``None`` is
sufficient here); calling it with the open file returns the extracted
text:

>>> from cybertools.text.pdf import PdfTransform
>>> transform = PdfTransform(None)
>>> words = transform(f).split()
>>> f.close()
>>> len(words)
89
>>> u'lamb' in words
True

3
text/__init__.py Normal file
View file

@ -0,0 +1,3 @@
"""
$Id$
"""

87
text/base.py Normal file
View file

@ -0,0 +1,87 @@
#
# Copyright (c) 2006 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Base classes for text transformations.
Based on code provided by zc.index.
$Id$
"""
__docformat__ = "reStructuredText"
import os, shutil, sys, tempfile
from zope.interface import implements
from cybertools.text.interfaces import ITextTransform, IFileTransform
def haveProgram(name):
    """Return true if the program `name` is available on the search path.

    Every directory listed in the ``PATH`` environment variable is checked
    for a regular file called `name` that the current user may execute.
    On Windows the extensions ``.com``, ``.exe`` and ``.bat`` are appended
    to `name` before looking.

    :param name: bare program name without a directory part,
        e.g. ``"pdftotext"``.
    :return: ``True`` if a matching executable file was found,
        ``False`` otherwise.
    """
    if sys.platform.lower().startswith("win"):
        extensions = (".com", ".exe", ".bat")
    else:
        extensions = ("",)
    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
    for path in execpath:
        for ext in extensions:
            fn = os.path.join(path, name + ext)
            # A plain file without execute permission cannot be started,
            # so it must not be reported as an available program.
            if os.path.isfile(fn) and os.access(fn, os.X_OK):
                return True
    return False
class BaseTransform(object):
    """Trivial transformation that takes a file's content as the text.

    The result of the first call is cached in ``self.text``; later calls
    return the cached value without looking at their argument again.
    """

    implements(ITextTransform)

    def __init__(self, context):
        """Remember `context` and start with an empty text cache."""
        self.context = context
        self.text = None

    def __call__(self, f):
        """Return the content of `f`, reading it on the first call only.

        :param f: a file system path, or an object with a ``read()``
            method (an already opened file).
        :return: whatever reading the file yields; cached in ``self.text``.
        """
        if self.text is None:
            if hasattr(f, 'read'):
                # Already an open file: read it, but leave closing it to
                # the caller who opened it.
                self.text = f.read()
            else:
                fr = open(f, 'r')
                try:
                    self.text = fr.read()
                finally:
                    # Release the handle even if reading fails.
                    fr.close()
        return self.text
class BaseFileTransform(BaseTransform):
    """Transformation that works on a temporary copy of the input on disk.

    Subclasses have to provide an ``extension`` attribute (the suffix
    given to the temporary file, e.g. ``".pdf"``) and to implement
    `extract()`.
    """

    implements(IFileTransform)

    def __call__(self, fr):
        """Return the text extracted from the open file object `fr`.

        The content of `fr` is copied to a file in a freshly created
        temporary directory, `extract()` is called on that copy, and the
        directory is removed again. The result is cached in ``self.text``,
        so `fr` is only read on the first call.

        :param fr: file-like object to read the source data from; it is
            not closed here.
        :return: the value returned by `extract()`.
        """
        if self.text is None:
            dirname = tempfile.mkdtemp()
            try:
                # Built inside the ``try`` so that the directory is removed
                # even if the subclass does not define ``extension``.
                filename = os.path.join(dirname, "temp" + self.extension)
                fw = open(filename, "wb")
                try:
                    shutil.copyfileobj(fr, fw)
                finally:
                    # The copy must be closed before the directory is
                    # removed, also when copying fails.
                    fw.close()
                text = self.extract(dirname, filename)
            finally:
                shutil.rmtree(dirname)
            self.text = text
        return self.text

    def extract(self, dirname, filename):
        """Extract the text from `filename`; to be overridden.

        :param dirname: temporary directory that may be used for
            intermediate files.
        :param filename: path of the temporary copy of the source data.
        :raises ValueError: always, in this base implementation.
        """
        raise ValueError('Method extract() has to be implemented by subclass.')

43
text/interfaces.py Normal file
View file

@ -0,0 +1,43 @@
#
# Copyright (c) 2006 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Interface definitions for text transformations.
$Id$
"""
from zope.interface import Interface
class ITextTransform(Interface):
    """A callable that converts the content of a file to plain text."""

    def __call__(f):
        """Transform the content of file `f` to plain text and return
        the result as unicode.
        """
class IFileTransform(ITextTransform):
    """A transformation that uses an intermediate disk file."""

    def extract(dirname, filename):
        """Extract the text contents from the file specified by `dirname`
        and `filename`, using some external program, and return the
        result as unicode.
        """

26
text/pdf.py Normal file
View file

@ -0,0 +1,26 @@
"""Searchable text support for Portable Document Format (PDF) files.
This uses the pdftotext command from xpdf to perform the extraction.
"""
__docformat__ = "reStructuredText"
import os, sys
from cybertools.text import base
class PdfTransform(base.BaseFileTransform):
    """Extract the text of a PDF file by running ``pdftotext``."""

    extension = ".pdf"

    def extract(self, directory, filename):
        """Convert the PDF file `filename` to text and return it as unicode.

        :param directory: directory in which the intermediate text file
            ``words.txt`` is created.
        :param filename: path of the PDF file to convert.
        :return: the extracted text decoded from UTF-8, or an empty
            unicode string (after printing a warning) if ``pdftotext`` is
            not available or did not produce an output file.
        """
        # Imported locally: the module's import block does not provide it.
        import subprocess

        if not base.haveProgram("pdftotext"):
            print('Warning: pdftotext is not available')
            return u''
        txtfile = os.path.join(directory, "words.txt")
        # An argument list (no shell involved) keeps paths that contain
        # spaces or shell metacharacters intact.
        status = subprocess.call(
            ["pdftotext", "-enc", "UTF-8", filename, txtfile])
        if not os.path.isfile(txtfile):
            # Same best-effort result as for a missing program, instead of
            # an unexplained IOError from open() below.
            print('Warning: pdftotext failed with status %s' % status)
            return u''
        f = open(txtfile, "rb")
        try:
            data = f.read()
        finally:
            f.close()
        return data.decode("utf-8")

BIN
text/testfiles/mary.pdf Normal file

Binary file not shown.

28
text/tests.py Executable file
View file

@ -0,0 +1,28 @@
#! /usr/bin/python
"""
Tests for the 'cybertools.text' package.
$Id$
"""
import unittest, doctest
from zope.testing.doctestunit import DocFileSuite
from cybertools.text import pdf
class Test(unittest.TestCase):
    """Basic tests for the text package."""

    def testBasicStuff(self):
        # Placeholder: there are no assertions in this test case.
        return None
def test_suite():
    """Return a suite holding the unit tests and the README doctests."""
    doctest_options = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
    unit_tests = unittest.makeSuite(Test)
    readme_tests = DocFileSuite('README.txt', optionflags=doctest_options)
    suite = unittest.TestSuite()
    # Same order as before: unit tests first, then the doctest file.
    suite.addTest(unit_tests)
    suite.addTest(readme_tests)
    return suite
# Run the complete suite when this module is executed as a script.
if __name__ == "__main__":
    unittest.main(defaultTest="test_suite")