added package cybertools.text
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1383 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
f527de4f21
commit
f2ec195a55
7 changed files with 206 additions and 0 deletions
19
text/README.txt
Normal file
19
text/README.txt
Normal file
|
@ -0,0 +1,19 @@
|
|||
=================================================
|
||||
Text transformations, e.g. for full-text indexing
|
||||
=================================================
|
||||
|
||||
($Id$)
|
||||
|
||||
>>> import os
|
||||
>>> from cybertools import text
|
||||
>>> directory = os.path.dirname(text.__file__)
|
||||
>>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
|
||||
>>> f = open(fn)
|
||||
|
||||
>>> from cybertools.text.pdf import PdfTransform
|
||||
>>> transform = PdfTransform(None)
|
||||
>>> words = transform(f).split()
|
||||
>>> len(words)
|
||||
89
|
||||
>>> u'lamb' in words
|
||||
True
|
3
text/__init__.py
Normal file
3
text/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
"""
|
||||
$Id$
|
||||
"""
|
87
text/base.py
Normal file
87
text/base.py
Normal file
|
@ -0,0 +1,87 @@
|
|||
#
|
||||
# Copyright (c) 2006 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Base classes for text transformations.
|
||||
|
||||
Based on code provided by zc.index.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
|
||||
__docformat__ = "reStructuredText"
|
||||
|
||||
import os, shutil, sys, tempfile
|
||||
from zope.interface import implements
|
||||
from cybertools.text.interfaces import ITextTransform, IFileTransform
|
||||
|
||||
def haveProgram(name):
    """Return true if the program `name` is available.

    Every directory listed in the ``PATH`` environment variable is
    searched for a regular file called `name` that the current user may
    execute.  On Windows the extensions ``.com``, ``.exe`` and ``.bat``
    are appended to `name` in turn; on all other platforms `name` is
    used unchanged.

    Returns ``True`` as soon as a match is found, ``False`` otherwise.
    """
    if sys.platform.lower().startswith("win"):
        extensions = (".com", ".exe", ".bat")
    else:
        extensions = ("",)
    for path in os.environ.get("PATH", "").split(os.path.pathsep):
        for ext in extensions:
            fn = os.path.join(path, name + ext)
            # A plain file without execute permission cannot be started,
            # so it must not be reported as an available program.
            if os.path.isfile(fn) and os.access(fn, os.X_OK):
                return True
    return False
|
||||
|
||||
|
||||
class BaseTransform(object):
    """Trivial transform that returns the content of a file unchanged.

    The content is read on the first call and cached in ``self.text``;
    later calls return the cached value and do not look at their
    argument again.
    """

    implements(ITextTransform)

    def __init__(self, context):
        """Remember `context` and start with an empty text cache."""
        self.context = context
        self.text = None    # filled by the first __call__

    def __call__(self, f):
        """Return the content of `f`.

        `f` may be a file name, or an object providing a ``read()``
        method (e.g. an already opened file).  An object passed in by
        the caller is read but not closed; a file opened here from a
        name is always closed again, even if reading fails.
        """
        if self.text is None:
            if hasattr(f, 'read'):
                # Already a file-like object: the caller owns it.
                self.text = f.read()
            else:
                fr = open(f, 'r')
                try:
                    self.text = fr.read()
                finally:
                    fr.close()
        return self.text
|
||||
|
||||
|
||||
class BaseFileTransform(BaseTransform):
    """Transform that works on an intermediate file on disk.

    The input is copied to a file in a fresh temporary directory, then
    ``extract()`` is called to produce the text.  Subclasses have to
    provide an ``extension`` attribute (the suffix of the temporary
    file, e.g. ``".pdf"``) and implement ``extract()``.
    """

    implements(IFileTransform)

    def __call__(self, fr):
        """Return the text extracted from the open file object `fr`.

        `fr` is read via ``shutil.copyfileobj`` and is not closed here.
        The result is cached in ``self.text``; later calls return the
        cached value.  The temporary directory is removed in every case,
        including when copying or extraction raises.
        """
        if self.text is None:
            dirname = tempfile.mkdtemp()
            try:
                # Computed inside the try block so that the directory is
                # also removed when ``self.extension`` is missing.
                filename = os.path.join(dirname, "temp" + self.extension)
                fw = open(filename, "wb")
                try:
                    shutil.copyfileobj(fr, fw)
                finally:
                    # Close before extract() so the data is flushed and
                    # the file can be removed afterwards.
                    fw.close()
                text = self.extract(dirname, filename)
            finally:
                shutil.rmtree(dirname)
            self.text = text
        return self.text

    def extract(self, dirname, filename):
        """Return the text of `filename`, located in `dirname`.

        This base implementation always raises ``ValueError``;
        subclasses must override it.
        """
        raise ValueError('Method extract() has to be implemented by subclass.')
|
||||
|
43
text/interfaces.py
Normal file
43
text/interfaces.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
#
|
||||
# Copyright (c) 2006 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Interface definitions for text transformations.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
from zope.interface import Interface
|
||||
|
||||
|
||||
class ITextTransform(Interface):
    """A callable that converts the content of a file to plain text."""

    def __call__(f):
        """Convert the content of the file `f` to plain text and
        return the result as unicode.
        """
|
||||
|
||||
|
||||
class IFileTransform(ITextTransform):
    """A text transformation that works on an intermediate disk file."""

    def extract(dirname, filename):
        """Extract the text content of the file specified by `dirname`
        and `filename`, using some external program, and return the
        result as unicode.
        """
|
26
text/pdf.py
Normal file
26
text/pdf.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
"""Searchable text support for Portable Document Format (PDF) files.
|
||||
|
||||
This uses the pdftotext command from xpdf to perform the extraction.
|
||||
|
||||
"""
|
||||
__docformat__ = "reStructuredText"
|
||||
|
||||
import os, sys
|
||||
|
||||
from cybertools.text import base
|
||||
|
||||
|
||||
class PdfTransform(base.BaseFileTransform):
    """Extract the text of a PDF file with the ``pdftotext`` program."""

    # Suffix of the temporary file the PDF data is written to.
    extension = ".pdf"

    def extract(self, directory, filename):
        """Return the text of the PDF file `filename` as unicode.

        ``pdftotext`` writes UTF-8 output to ``words.txt`` in
        `directory`; that file is read back and decoded.  If
        ``pdftotext`` is not found, a warning is printed and an empty
        unicode string is returned.
        """
        import subprocess

        if not base.haveProgram("pdftotext"):
            # Single parenthesised string: prints the same text whether
            # ``print`` is a statement or a function.
            print('Warning: pdftotext is not available')
            return u''
        txtfile = os.path.join(directory, "words.txt")
        # Pass the arguments as a list so that no shell is involved and
        # paths containing spaces or shell metacharacters arrive intact.
        subprocess.call(["pdftotext", "-enc", "UTF-8", filename, txtfile])
        # If the conversion produced no output file, open() raises here.
        f = open(txtfile, "rb")
        try:
            data = f.read()
        finally:
            f.close()
        return data.decode("utf-8")
|
BIN
text/testfiles/mary.pdf
Normal file
BIN
text/testfiles/mary.pdf
Normal file
Binary file not shown.
28
text/tests.py
Executable file
28
text/tests.py
Executable file
|
@ -0,0 +1,28 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
"""
|
||||
Tests for the 'cybertools.text' package.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
import unittest, doctest
|
||||
from zope.testing.doctestunit import DocFileSuite
|
||||
from cybertools.text import pdf
|
||||
|
||||
class Test(unittest.TestCase):
    """Basic tests for the text package."""

    def testBasicStuff(self):
        # Placeholder test case: intentionally checks nothing.
        pass
|
||||
|
||||
|
||||
def test_suite():
    """Return a suite with the `Test` unit tests followed by the
    doctests found in README.txt.
    """
    option_flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
    unit_tests = unittest.makeSuite(Test)
    readme_tests = DocFileSuite('README.txt', optionflags=option_flags)
    return unittest.TestSuite((unit_tests, readme_tests))
|
||||
|
||||
if __name__ == '__main__':
    # Executed as a script: run the suite returned by test_suite().
    unittest.main(defaultTest='test_suite')
|
Loading…
Add table
Reference in a new issue