clean-up of cybertools.text; provide msword conversion

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1603 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2007-03-01 11:40:35 +00:00
parent 53ae9f332a
commit b380b0fcd7
8 changed files with 529 additions and 44 deletions

View file

@ -1,18 +1,56 @@
================================================= =================================================
Text transformations, e.g. for full-text indexing Text Transformations, e.g. for Full-text Indexing
================================================= =================================================
($Id$) ($Id$)
If a converter program needed is not available we want to put a warning
into Zope's server log; in order to be able to test this we register
a log handler for testing:
>>> from zope.testing.loggingsupport import InstalledHandler
>>> log = InstalledHandler('zope.server')
The test files are in a subdirectory of the text package:
>>> import os >>> import os
>>> from cybertools import text >>> from cybertools import text
>>> directory = os.path.dirname(text.__file__) >>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
>>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
>>> f = open(fn) PDF Files
---------
Let's start with a PDF file:
>>> from cybertools.text.pdf import PdfTransform >>> from cybertools.text.pdf import PdfTransform
>>> transform = PdfTransform(None) >>> transform = PdfTransform(None)
>>> words = transform(f).split() >>> f = open(os.path.join(testdir, 'mary.pdf'))
This will be transformed to plain text:
>>> result = transform(f)
Let's check the log, should be empty:
>>> print log
So what is in the plain text result?
>>> words = result.split()
>>> len(words)
89
>>> u'lamb' in words
True
Word Documents
--------------
>>> from cybertools.text.doc import DocTransform
>>> transform = DocTransform(None)
>>> f = open(os.path.join(testdir, 'mary.doc'))
>>> result = transform(f)
>>> print log
>>> words = result.split()
>>> len(words) >>> len(words)
89 89
>>> u'lamb' in words >>> u'lamb' in words

View file

@ -19,32 +19,17 @@
""" """
Base classes for text transformations. Base classes for text transformations.
Based on code provided by zc.index. Based on code provided by zc.index and TextIndexNG3.
$Id$ $Id$
""" """
__docformat__ = "reStructuredText"
import os, shutil, sys, tempfile import os, shutil, sys, tempfile
import logging
from zope.interface import implements from zope.interface import implements
from cybertools.text.interfaces import ITextTransform, IFileTransform from cybertools.text.interfaces import ITextTransform, IFileTransform
def haveProgram(name):
    """Tell whether a program called `name` can be found on the PATH.

    Every directory listed in the PATH environment variable is searched
    for a regular file called `name` (on Windows: `name` followed by one
    of ``.com``, ``.exe``, ``.bat``). Only the existence of the file is
    checked, not whether it may be executed.
    """
    suffixes = ("",)
    if sys.platform.lower().startswith("win"):
        suffixes = (".com", ".exe", ".bat")
    candidates = [name + suffix for suffix in suffixes]
    for folder in os.environ.get("PATH", "").split(os.path.pathsep):
        for candidate in candidates:
            if os.path.isfile(os.path.join(folder, candidate)):
                return True
    return False
class BaseTransform(object): class BaseTransform(object):
@ -54,11 +39,9 @@ class BaseTransform(object):
self.context = context self.context = context
self.text = None self.text = None
def __call__(self, f): def __call__(self, fr):
if self.text is None: if self.text is None:
fr = open(f, 'r')
self.text = fr.read() self.text = fr.read()
fr.close()
return self.text return self.text
@ -66,22 +49,45 @@ class BaseFileTransform(BaseTransform):
implements(IFileTransform) implements(IFileTransform)
extension = '.txt'
def __call__(self, fr): def __call__(self, fr):
if self.text is None: if self.text is None:
#fr = f.open("rb")
dirname = tempfile.mkdtemp() dirname = tempfile.mkdtemp()
filename = os.path.join(dirname, "temp" + self.extension) filename = os.path.join(dirname, "temp" + self.extension)
try: try:
fw = open(filename, "wb") fw = open(filename, "wb")
shutil.copyfileobj(fr, fw) shutil.copyfileobj(fr, fw)
#fr.close()
fw.close() fw.close()
text = self.extract(dirname, filename) text = self.extract(dirname, filename)
finally: finally:
shutil.rmtree(dirname) shutil.rmtree(dirname)
#fr.close()
self.text = text self.text = text
return self.text return self.text
def extract(self, dirname, filename): def extract(self, dirname, filename):
raise ValueError('Method extract() has to be implemented by subclass.') raise ValueError('Method extract() has to be implemented by subclass.')
def execute(self, com):
    """ Execute the command line ``com`` and return everything that can
        be read from the pipe connected to the program called.

        ``win32pipe.popen`` is used if the ``win32pipe`` module can be
        imported, ``os.popen`` otherwise.
    """
    try:
        import win32pipe
    except ImportError:
        popen = os.popen
    else:
        popen = win32pipe.popen
    pipe = popen(com)
    try:
        return pipe.read()
    finally:
        # Close the pipe explicitly instead of leaving it (and the child
        # process behind it) to the garbage collector.
        pipe.close()
def checkAvailable(self, name, logMessage=''):
    """ Check whether a program called ``name`` can be found on the PATH.

        Every directory listed in the PATH environment variable is
        searched for a regular file called ``name`` (on Windows: ``name``
        followed by one of ``.com``, ``.exe``, ``.bat``). Only the
        existence of the file is checked, not whether it may be executed.

        Return True if such a file is found. Otherwise return False and,
        if ``logMessage`` is not empty, write it as a warning to the
        'zope.server' logger.
    """
    if sys.platform.lower().startswith("win"):
        extensions = (".com", ".exe", ".bat")
    else:
        extensions = ("",)
    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
    for path in execpath:
        for ext in extensions:
            fn = os.path.join(path, name + ext)
            if os.path.isfile(fn):
                return True
    if logMessage:
        # Logger.warn() is only a deprecated alias of Logger.warning().
        logging.getLogger('zope.server').warning(logMessage)
    return False

355
text/config/wvText.xml Executable file
View file

@ -0,0 +1,355 @@
<main>
<charentity>
<begin>ABW</begin>
</charentity>
<document>
<begin>
</begin>
<end>
</end>
</document>
<section>
<begin>
</begin>
<end>
</end>
</section>
<justification>
<left></left>
<right></right>
<center></center>
<block></block>
<asian></asian>
</justification>
<numbering>
<Arabic>type=&quot;1&quot;</Arabic>
<UpperRoman>type=&quot;I&quot;</UpperRoman>
<LowerRoman>type=&quot;i&quot;</LowerRoman>
<UpperCaseN>type=&quot;A&quot;</UpperCaseN>
<LowerCaseN>type=&quot;a&quot;</LowerCaseN>
</numbering>
<border>
<noned></noned>
<singled></singled>
<thickd></thickd>
<doubled></doubled>
<number4d></number4d>
<hairlined></hairlined>
<dotd></dotd>
<dashlargegapd></dashlargegapd>
<dotdashd></dotdashd>
<dotdotdashd></dotdotdashd>
<tripled></tripled>
<thin-thicksmallgapd></thin-thicksmallgapd>
<thick-thinsmallgapd></thick-thinsmallgapd>
<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
<thin-thickmediumgapd></thin-thickmediumgapd>
<thick-thinmediumgapd></thick-thinmediumgapd>
<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
<thin-thicklargegapd></thin-thicklargegapd>
<thick-thinlargegapd></thick-thinlargegapd>
<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
<waved></waved>
<doublewaved></doublewaved>
<dashsmallgapd></dashsmallgapd>
<dashdotstrokedd></dashdotstrokedd>
<emboss3Dd></emboss3Dd>
<engrave3Dd></engrave3Dd>
<defaultd></defaultd>
</border>
<olist>
<begin></begin>
<end></end>
</olist>
<ulist>
<begin></begin>
<end></end>
</ulist>
<entry>
<begin></begin>
<end></end>
</entry>
<!-- the only thing of significance -->
<text>
<begin></begin>
<end>
</end>
</text>
<!--
this tableoverride option can be used to turn off handling of
these tags in tables, which I find is necessary for at least netscape
-->
<tableoverrides>
<ParaBefore>0</ParaBefore>
<ParaRight>0</ParaRight>
<ParaAfter>0</ParaAfter>
<ParaLeft>0</ParaLeft>
<ParaLeft1>0</ParaLeft1>
<VertMergedCells>0</VertMergedCells>
</tableoverrides>
<table>
<begin></begin>
<end></end>
</table>
<row>
<begin></begin>
<end></end>
</row>
<cell>
<begin></begin>
<end></end>
</cell>
<paragraph>
<begin><text.begin/></begin>
<end><text.end/></end>
</paragraph>
<!-- these are all the character properties that can show up in word -->
<bold><begin></begin><end></end></bold>
<italic><begin></begin><end></end></italic>
<!--
text that has been deleted and will be displayed with strikethrough when
revision marked text is to be displayed
use either this line...
-->
<RMarkDel><begin></begin>
<end></end>
</RMarkDel>
<!--
or uncomment below to make deleted text disappear (well, become commented out)
-->
<!--
<RMarkDel><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></RMarkDel>
-->
<!-- I don't even know what outline means -->
<outline><begin></begin><end></end></outline>
<smallcaps><begin></begin><end></end></smallcaps>
<caps><begin></begin><end></end></caps>
<vanish><begin></begin><end></end></vanish>
<!--If you uncomment this then the annotation text links will become commented out by html tags-->
<!--
<vanish><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></vanish>
-->
<!--
text that has been newly typed since the last time revision marks have been accepted
and will be displayed with underline when revision marked text is to be displayed
use either this line...
-->
<RMark><begin></begin><end></end></RMark>
<!--
or uncomment below to make the underline disappear
-->
<!--
<RMark><begin></begin><end></end></RMark>
-->
<strike><begin></begin><end></end></strike>
<shadow><begin></begin><end></end></shadow>
<lowercase><begin></begin><end></end></lowercase>
<emboss><begin></begin><end></end></emboss>
<imprint><begin></begin><end></end></imprint>
<!--double strike-->
<dstrike><begin></begin><end></end></dstrike>
<!--
ftc's
&
hps
keep them for font face and do that later.
-->
<super><begin></begin><end></end></super>
<sub><begin></begin><end></end></sub>
<singleu><begin></begin><end></end></singleu>
<wordu><begin></begin><end></end></wordu>
<doubleu><begin></begin><end></end></doubleu>
<dottedu><begin></begin><end></end></dottedu>
<hiddenu><begin></begin><end></end></hiddenu>
<thicku><begin></begin><end></end></thicku>
<dashu><begin></begin><end></end></dashu>
<dotu><begin></begin><end></end></dotu>
<dotdashu><begin></begin><end></end></dotdashu>
<dotdotdashu><begin></begin><end></end></dotdotdashu>
<waveu><begin></begin><end></end></waveu>
<!--
text whose properties have been changed since the last time revision marks have been accepted
and will be displayed with a note showing the change points.
use either this line (which admit it a bit scary looking, but harmless)...
-->
<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
<!--
or uncomment below to make the notes disappear
-->
<!--
<PropRMark><begin></begin><end></end></PropRMark>
-->
<!--
<color>
-->
<Black><begin></begin><end></end></Black>
<Blue><begin></begin><end></end></Blue>
<Cyan><begin></begin><end></end></Cyan>
<Green><begin></begin><end></end></Green>
<Magenta><begin></begin><end></end></Magenta>
<Red><begin></begin><end></end></Red>
<Yellow><begin></begin><end></end></Yellow>
<White><begin></begin><end></end></White>
<DkBlue><begin></begin><end></end></DkBlue>
<DkCyan><begin></begin><end></end></DkCyan>
<DkGreen><begin></begin><end></end></DkGreen>
<DkMagenta><begin></begin><end></end></DkMagenta>
<DkRed><begin></begin><end></end></DkRed>
<DkYellow><begin></begin><end></end></DkYellow>
<DkGray><begin></begin><end></end></DkGray>
<LtGray><begin></begin><end></end></LtGray>
<!--
</color>
-->
<!--
<animation>
-->
<LasVegas><begin></begin><end></end></LasVegas>
<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
<SparkleText><begin></begin><end></end></SparkleText>
<MarchingAnts><begin></begin><end></end></MarchingAnts>
<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
<Shimmer><begin></begin><end></end></Shimmer>
<!--
</animation>
-->
<!--
I dont understand what this one is, and ive never come across it
use this sample line (which admit it a bit scary looking, but harmless)...
-->
<DispFldRMark><begin></begin><end></end></DispFldRMark>
<!--
or uncomment below to ignore it, the previous might even crash wv ?
-->
<!--
<DispFldRMark><begin></begin><end></end></DispFldRMark>
-->
<animation>
<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
</animation>
<fontstr>
<begin></begin>
<end></end>
</fontstr>
<comment>
<begin>
</begin>
<end>
</end>
</comment>
<style name="Normal">
<character>
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
</character>
<!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
border-top: thin <bordertopstyle/> <bordertopcolor/>;
border-left: thin <borderleftstyle/> <borderleftcolor/>;
border-right: thin <borderrightstyle/> <borderrightcolor/>;
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
-->
<pmargin>
<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
</pmargin>
<pborder>
<begin>
<!--
border: thin <borderleftstyle/> <borderleftcolor/>;
border-top: thin <bordertopstyle/> <bordertopcolor/>;
border-left: thin <borderleftstyle/> <borderleftcolor/>;
border-right: thin <borderrightstyle/> <borderrightcolor/>;
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
-->
</begin>
</pborder>
<picture>
<begin>
</begin>
<!-- images are lacking for now -->
</picture>
</style>
<!--we need to be able to override the character properties-->
<!--
<style name="Normal">
<character>
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
</character>
<text>
<begin></begin>
<end>
</end>
</text>
</style>
<style name="Heading 1">
<character>
<begin></begin>
<end></end>
</character>
<text>
<begin></begin>
<end>
</end>
</text>
</style>
-->
</main>

54
text/doc.py Normal file
View file

@ -0,0 +1,54 @@
#
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Searchable text support for Microsoft Word (.doc) files.
This uses the wvWare command, configured by config/wvText.xml, to perform
the extraction.
Based on code provided by zc.index and TextIndexNG3.
$Id$
"""
import os, sys
from cybertools.text import base
# Locate the package directory: via package_home() when the ``Globals``
# module can be imported, via this module's own path otherwise.
try:
    from Globals import package_home
    _home = package_home(globals())
except ImportError:
    _home = os.path.dirname(__file__)
# Path of the configuration file handed to wvWare with the -x option.
wvConf = os.path.join(_home, 'config', 'wvText.xml')
class DocTransform(base.BaseFileTransform):
    """ Transform a Word (.doc) file to plain text by calling the
        external ``wvWare`` program.
    """

    extension = ".doc"

    def extract(self, directory, filename):
        """ Run wvWare on ``filename`` and return its output decoded from
            UTF-8. Return an empty unicode string if ``checkAvailable()``
            reports that wvWare is missing.

            ``directory`` is not used here.
        """
        if not self.checkAvailable('wvWare', 'wvWare is not available'):
            return u''
        # Only the target of the stderr redirection differs per platform.
        if sys.platform == 'win32':
            template = 'wvWare -c utf-8 --nographics -x "%s" "%s" 2> nul:'
        else:
            template = 'wvWare -c utf-8 --nographics -x "%s" "%s" 2> /dev/null'
        output = self.execute(template % (wvConf, filename))
        return output.decode('UTF-8')

View file

@ -27,17 +27,32 @@ from zope.interface import Interface
class ITextTransform(Interface): class ITextTransform(Interface):
def __call__(f): def __call__(fr):
""" Transform the content of file f to plain text and return """ Transform the content of file fr (readfile) to plain text and
the result as unicode. return the result as unicode.
""" """
class IFileTransform(ITextTransform): class IFileTransform(ITextTransform):
""" A transformation that uses an intermediate disk file. """ A transformation that is performed by calling some external program
and that typically uses an intermediate disk file.
""" """
def extract(dirname, filename): def extract(dirname, filename):
""" Extract text contents from the file specified by dirnam, filename, """ Extract text contents from the file specified by ``filename``,
using some external program, and return the result as unicode. using some external program, and return the result as unicode.
``dirname`` is the path to a temporary directory that
usually (but not necessarily) contains the file and may
be used for creating other (temporary) files if needed.
"""
def execute(command):
""" Execute a system command and return the output of the program
called.
"""
def checkAvailable(progname, logMessage=''):
""" Check the availability of the program named ``progname``.
Return True if available; if ``logMessage`` is given, put this
as a warning message into the log if the program is not available.
""" """

View file

@ -1,9 +1,31 @@
"""Searchable text support for Portable Document Format (PDF) files. #
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
This uses the pdftotext command from xpdf to perform the extraction. #
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
""" """
__docformat__ = "reStructuredText" Searchable text support for Portable Document Format (PDF) files.
This uses the pdftotext command from xpdf to perform the extraction.
interface definitions for text transformations.
Based on code provided by zc.index and TextIndexNG3.
$Id$
"""
import os, sys import os, sys
@ -15,12 +37,7 @@ class PdfTransform(base.BaseFileTransform):
extension = ".pdf" extension = ".pdf"
def extract(self, directory, filename): def extract(self, directory, filename):
if not base.haveProgram("pdftotext"): if not self.checkAvailable('pdftotext', 'pdftotext is not available'):
print 'Warning: pdftotext is not available'
return u'' return u''
txtfile = os.path.join(directory, "words.txt") data = self.execute('pdftotext -enc UTF-8 "%s" -' % filename)
st = os.system("pdftotext -enc UTF-8 %s %s" % (filename, txtfile)) return data.decode('UTF-8')
f = open(txtfile, "rb")
data = f.read()
f.close()
return unicode(data, "utf-8")

BIN
text/testfiles/mary.doc Normal file

Binary file not shown.

BIN
text/testfiles/mary.odt Normal file

Binary file not shown.