added package cybertools.text
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1383 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
f527de4f21
commit
f2ec195a55
7 changed files with 206 additions and 0 deletions
19
text/README.txt
Normal file
19
text/README.txt
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
=================================================
|
||||||
|
Text transformations, e.g. for full-text indexing
|
||||||
|
=================================================
|
||||||
|
|
||||||
|
($Id$)
|
||||||
|
|
||||||
|
>>> import os
|
||||||
|
>>> from cybertools import text
|
||||||
|
>>> directory = os.path.dirname(text.__file__)
|
||||||
|
>>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
|
||||||
|
>>> f = open(fn)
|
||||||
|
|
||||||
|
>>> from cybertools.text.pdf import PdfTransform
|
||||||
|
>>> transform = PdfTransform(None)
|
||||||
|
>>> words = transform(f).split()
|
||||||
|
>>> len(words)
|
||||||
|
89
|
||||||
|
>>> u'lamb' in words
|
||||||
|
True
|
3
text/__init__.py
Normal file
3
text/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
"""
|
||||||
|
$Id$
|
||||||
|
"""
|
87
text/base.py
Normal file
87
text/base.py
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2006 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Base classes for text transformations.
|
||||||
|
|
||||||
|
Based on code provided by zc.index.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
__docformat__ = "reStructuredText"
|
||||||
|
|
||||||
|
import os, shutil, sys, tempfile
|
||||||
|
from zope.interface import implements
|
||||||
|
from cybertools.text.interfaces import ITextTransform, IFileTransform
|
||||||
|
|
||||||
|
def haveProgram(name):
    """Return true if the program `name` is available.

    The directories listed in the ``PATH`` environment variable are
    searched for a regular, executable file called `name`.  On Windows
    the extensions ``.com``, ``.exe`` and ``.bat`` are appended to `name`
    in turn; on all other platforms `name` is used as given.
    """
    if sys.platform.lower().startswith("win"):
        extensions = (".com", ".exe", ".bat")
    else:
        extensions = ("",)
    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
    for path in execpath:
        for ext in extensions:
            fn = os.path.join(path, name + ext)
            # A plain file without execute permission cannot be run, so it
            # must not be reported as an available program.
            if os.path.isfile(fn) and os.access(fn, os.X_OK):
                return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
class BaseTransform(object):
    """Transform that returns the content of a file unchanged.

    The text is read on the first call and cached in ``self.text``;
    later calls return the cached value without looking at their argument.
    """

    implements(ITextTransform)

    def __init__(self, context):
        """Store `context` and start with an empty text cache."""
        self.context = context
        self.text = None

    def __call__(self, f):
        """Return the content of `f`.

        `f` may be the name of a file or an object providing a ``read()``
        method.  A file opened here is closed again; an object passed in
        by the caller is left open.
        """
        if self.text is None:
            if hasattr(f, 'read'):
                # Already a file-like object (this is what
                # BaseFileTransform receives as well): read it directly.
                self.text = f.read()
            else:
                fr = open(f, 'r')
                try:
                    self.text = fr.read()
                finally:
                    # Do not leak the handle when read() fails.
                    fr.close()
        return self.text
|
||||||
|
|
||||||
|
|
||||||
|
class BaseFileTransform(BaseTransform):
    """Transform that works on an intermediate disk file.

    The input is copied to a file in a fresh temporary directory and
    ``extract()`` is called to obtain the text from that file.  Subclasses
    have to provide an ``extension`` attribute (the suffix given to the
    temporary file) and implement ``extract()``.
    """

    implements(IFileTransform)

    def __call__(self, fr):
        """Return the text extracted from the readable file object `fr`.

        The result is cached in ``self.text``; later calls return the
        cached value.  `fr` is not closed.  The temporary directory is
        removed again, whether or not the extraction succeeds.
        """
        if self.text is None:
            dirname = tempfile.mkdtemp()
            try:
                # Inside the try block so that the directory is removed
                # even if the subclass did not define ``extension``.
                filename = os.path.join(dirname, "temp" + self.extension)
                fw = open(filename, "wb")
                try:
                    shutil.copyfileobj(fr, fw)
                finally:
                    # Close the copy even when copying fails so that the
                    # handle is not leaked.
                    fw.close()
                text = self.extract(dirname, filename)
            finally:
                shutil.rmtree(dirname)
            self.text = text
        return self.text

    def extract(self, dirname, filename):
        """Return the text of `filename`, which lives in `dirname`.

        This base implementation always raises ValueError; subclasses
        have to override it.
        """
        raise ValueError('Method extract() has to be implemented by subclass.')
|
||||||
|
|
43
text/interfaces.py
Normal file
43
text/interfaces.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2006 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Interface definitions for text transformations.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
from zope.interface import Interface
|
||||||
|
|
||||||
|
|
||||||
|
class ITextTransform(Interface):
    """A callable that turns the content of a file into plain text."""

    def __call__(f):
        """Transform the content of file f to plain text and return
        the result as unicode.
        """
|
||||||
|
|
||||||
|
|
||||||
|
class IFileTransform(ITextTransform):
    """A transformation that uses an intermediate disk file."""

    def extract(dirname, filename):
        """Extract the text contents from the file specified by dirname,
        filename, using some external program, and return the result
        as unicode.
        """
|
26
text/pdf.py
Normal file
26
text/pdf.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
"""Searchable text support for Portable Document Format (PDF) files.
|
||||||
|
|
||||||
|
This uses the pdftotext command from xpdf to perform the extraction.
|
||||||
|
|
||||||
|
"""
|
||||||
|
__docformat__ = "reStructuredText"
|
||||||
|
|
||||||
|
import os, sys
|
||||||
|
|
||||||
|
from cybertools.text import base
|
||||||
|
|
||||||
|
|
||||||
|
class PdfTransform(base.BaseFileTransform):
    """Extract the text of a PDF file by running ``pdftotext``."""

    extension = ".pdf"

    def extract(self, directory, filename):
        """Return the text of the PDF file `filename` as unicode.

        ``pdftotext`` writes its UTF-8 output to ``words.txt`` in
        `directory`; that file is read back and decoded.  If ``pdftotext``
        is not available or does not finish successfully, a warning is
        printed and an empty unicode string is returned.
        """
        # Imported here so that the module's import section does not have
        # to change.
        import subprocess

        if not base.haveProgram("pdftotext"):
            print('Warning: pdftotext is not available')
            return u''
        txtfile = os.path.join(directory, "words.txt")
        # Pass the arguments as a list: no shell is involved, so paths
        # containing spaces or shell metacharacters are handled correctly.
        try:
            st = subprocess.call(
                ["pdftotext", "-enc", "UTF-8", filename, txtfile])
        except OSError:
            st = -1
        if st != 0:
            # The output file may be missing or incomplete; do not read it.
            print('Warning: pdftotext failed with exit status %s' % st)
            return u''
        f = open(txtfile, "rb")
        try:
            data = f.read()
        finally:
            f.close()
        return data.decode("utf-8")
|
BIN
text/testfiles/mary.pdf
Normal file
BIN
text/testfiles/mary.pdf
Normal file
Binary file not shown.
28
text/tests.py
Executable file
28
text/tests.py
Executable file
|
@ -0,0 +1,28 @@
|
||||||
|
#! /usr/bin/python
|
||||||
|
|
||||||
|
"""
|
||||||
|
Tests for the 'cybertools.text' package.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest, doctest
|
||||||
|
from zope.testing.doctestunit import DocFileSuite
|
||||||
|
from cybertools.text import pdf
|
||||||
|
|
||||||
|
class Test(unittest.TestCase):
    """Basic tests for the text package."""

    def testBasicStuff(self):
        # Placeholder test case: it has no assertions and always passes.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_suite():
    """Return a suite with the ``Test`` case and the README.txt doctests."""
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(Test))
    suite.addTest(DocFileSuite(
        'README.txt',
        optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS))
    return suite
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run as a script: let unittest execute the suite built by test_suite().
    unittest.main(defaultTest='test_suite')
|
Loading…
Add table
Reference in a new issue