clean-up of cybertools.text; provide msword conversion

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1603 fd906abe-77d9-0310-91a1-e0d9ade77398
2007-03-01 11:40:35 +00:00 · 2007-03-01 11:40:35 +00:00 · b380b0fcd7
commit b380b0fcd7
parent 53ae9f332a
8 changed files with 529 additions and 44 deletions
--- a/text/README.txt
+++ b/text/README.txt
@ -1,18 +1,56 @@
 =================================================
-Text transformations, e.g. for full-text indexing
+Text Transformations, e.g. for Full-text Indexing
 =================================================
  ($Id$)
 If a converter program needed is not available we want to put a warning
 into Zope's server log; in order to be able to test this we register
 a log handler for testing:
  >>> from zope.testing.loggingsupport import InstalledHandler
  >>> log = InstalledHandler('zope.server')
 The test files are in a subdirectory of the text package:
  >>> import os
  >>> from cybertools import text
-  >>> directory = os.path.dirname(text.__file__)
+  >>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
-  >>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
+
-  >>> f = open(fn)
+PDF Files
 ---------
 Let's start with a PDF file:
  >>> from cybertools.text.pdf import PdfTransform
  >>> transform = PdfTransform(None)
-  >>> words = transform(f).split()
+  >>> f = open(os.path.join(testdir, 'mary.pdf'))
 This will be transformed to plain text:
  >>> result = transform(f)
 Let's check the log, should be empty:
  >>> print log
 So what is in the plain text result?
  >>> words = result.split()
  >>> len(words)
  89
  >>> u'lamb' in words
  True
 Word Documents
 --------------
  >>> from cybertools.text.doc import DocTransform
  >>> transform = DocTransform(None)
  >>> f = open(os.path.join(testdir, 'mary.doc'))
  >>> result = transform(f)
  >>> print log
  >>> words = result.split()
  >>> len(words)
  89
  >>> u'lamb' in words
--- a/text/base.py
+++ b/text/base.py
@ -19,32 +19,17 @@
 """
 Base classes for text transformations.
-Based on code provided by zc.index.
+Based on code provided by zc.index and TextIndexNG3.
 $Id$
 """
 __docformat__ = "reStructuredText"
 import os, shutil, sys, tempfile
 import logging
 from zope.interface import implements
 from cybertools.text.interfaces import ITextTransform, IFileTransform
 def haveProgram(name):
    """Tell whether a program called `name` can be found on the PATH.

    Each directory listed in the ``PATH`` environment variable is checked
    for a regular file named `name`; on Windows platforms the suffixes
    ``.com``, ``.exe`` and ``.bat`` are appended for the lookup.
    """
    suffixes = ("",)
    if sys.platform.lower().startswith("win"):
        suffixes = (".com", ".exe", ".bat")
    for directory in os.environ.get("PATH", "").split(os.path.pathsep):
        for suffix in suffixes:
            candidate = os.path.join(directory, name + suffix)
            if os.path.isfile(candidate):
                return True
    return False
 class BaseTransform(object):
@ -54,11 +39,9 @@ class BaseTransform(object):
        self.context = context
        self.text = None
-    def __call__(self, f):
+    def __call__(self, fr):
        if self.text is None:
            fr = open(f, 'r')
            self.text = fr.read()
            fr.close()
        return self.text
@ -66,22 +49,45 @@ class BaseFileTransform(BaseTransform):
    implements(IFileTransform)
    extension = '.txt'
    def __call__(self, fr):
        if self.text is None:
            #fr = f.open("rb")
            dirname = tempfile.mkdtemp()
            filename = os.path.join(dirname, "temp" + self.extension)
            try:
                fw = open(filename, "wb")
                shutil.copyfileobj(fr, fw)
                #fr.close()
                fw.close()
                text = self.extract(dirname, filename)
            finally:
                shutil.rmtree(dirname)
                #fr.close()
            self.text = text
        return self.text
    def extract(self, dirname, filename):
        """Placeholder for the real extraction; always raises ValueError.

        Subclasses override this method to extract the text from
        ``filename``.
        """
        message = 'Method extract() has to be implemented by subclass.'
        raise ValueError(message)
    def execute(self, com):
        """Run the command line ``com`` and return what it wrote to stdout.

        ``win32pipe.popen`` is used when the ``win32pipe`` module can be
        imported, ``os.popen`` otherwise.

        NOTE: ``com`` is passed on unchanged, so callers are responsible
        for quoting any file names they put into it.
        """
        try:
            import win32pipe
        except ImportError:
            popen = os.popen
        else:
            popen = win32pipe.popen
        pipe = popen(com)
        try:
            return pipe.read()
        finally:
            # close the pipe explicitly instead of relying on garbage
            # collection to release it
            pipe.close()
    def checkAvailable(self, name, logMessage=''):
        """Tell whether the program ``name`` can be found on the PATH.

        Every directory listed in the ``PATH`` environment variable is
        searched for a regular file called ``name``; on Windows platforms
        the suffixes ``.com``, ``.exe`` and ``.bat`` are appended to
        ``name`` for the lookup.

        Return True as soon as a matching file is found.  Otherwise return
        False and, if ``logMessage`` is not empty, write it as a warning to
        the ``zope.server`` logger.
        """
        if sys.platform.lower().startswith("win"):
            extensions = (".com", ".exe", ".bat")
        else:
            extensions = ("",)
        execpath = os.environ.get("PATH", "").split(os.path.pathsep)
        for path in execpath:
            for ext in extensions:
                if os.path.isfile(os.path.join(path, name + ext)):
                    return True
        if logMessage:
            # Logger.warn() is a deprecated alias that newer Python versions
            # no longer provide; warning() is the supported spelling.
            logging.getLogger('zope.server').warning(logMessage)
        return False
--- a/text/config/wvText.xml
+++ b/text/config/wvText.xml
@ -0,0 +1,355 @@
 <main>
 <charentity>
 <begin>ABW</begin>
 </charentity>
 <document>
 <begin>
 </begin>
 <end>
 </end>
 </document>
 <section>
 <begin>
 </begin>
 <end>
 </end>
 </section>
 <justification>
 <left></left>
 <right></right>
 <center></center>
 <block></block>
 <asian></asian>
 </justification>
 <numbering>
 <Arabic>type=&quot;1&quot;</Arabic>
 <UpperRoman>type=&quot;I&quot;</UpperRoman>
 <LowerRoman>type=&quot;i&quot;</LowerRoman>
 <UpperCaseN>type=&quot;A&quot;</UpperCaseN>
 <LowerCaseN>type=&quot;a&quot;</LowerCaseN>
 </numbering>
 <border>
 <noned></noned>
 <singled></singled>
 <thickd></thickd>
 <doubled></doubled>
 <number4d></number4d>
 <hairlined></hairlined>
 <dotd></dotd>
 <dashlargegapd></dashlargegapd>
 <dotdashd></dotdashd>
 <dotdotdashd></dotdotdashd>
 <tripled></tripled>
 <thin-thicksmallgapd></thin-thicksmallgapd>
 <thick-thinsmallgapd></thick-thinsmallgapd>
 <thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
 <thin-thickmediumgapd></thin-thickmediumgapd>
 <thick-thinmediumgapd></thick-thinmediumgapd>
 <thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
 <thin-thicklargegapd></thin-thicklargegapd>
 <thick-thinlargegapd></thick-thinlargegapd>
 <thin-thick-thinlargegapd></thin-thick-thinlargegapd>
 <waved></waved>
 <doublewaved></doublewaved>
 <dashsmallgapd></dashsmallgapd>
 <dashdotstrokedd></dashdotstrokedd>
 <emboss3Dd></emboss3Dd>
 <engrave3Dd></engrave3Dd>
 <defaultd></defaultd>
 </border>
 <olist>
 <begin></begin>
 <end></end>
 </olist>
 <ulist>
 <begin></begin>
 <end></end>
 </ulist>
 <entry>
 <begin></begin>
 <end></end>
 </entry>
 <!-- the only thing of significance -->
 <text>	
 <begin></begin>
 <end>
 </end>
 </text>
 <!-- 
 this tableoverride option can be used to turn off handling of
 these tags in tables, which I find is necessary for at least netscape
 -->
 <tableoverrides>
 <ParaBefore>0</ParaBefore>
 <ParaRight>0</ParaRight>
 <ParaAfter>0</ParaAfter>
 <ParaLeft>0</ParaLeft>
 <ParaLeft1>0</ParaLeft1>
 <VertMergedCells>0</VertMergedCells>
 </tableoverrides>
 <table>
 <begin></begin>
 <end></end>
 </table>
 <row>
 <begin></begin>
 <end></end>
 </row>
 <cell>
 <begin></begin>
 <end></end>
 </cell>
 <paragraph>
 <begin><text.begin/></begin>
 <end><text.end/></end>
 </paragraph>
 <!-- these are all the character properties that can show up in word -->
 <bold><begin></begin><end></end></bold>
 <italic><begin></begin><end></end></italic>
 <!--
 text that has been deleted and will be displayed with strikethrough when
 revision marked text is to be displayed
 use either this line...
 --> 
 <RMarkDel><begin></begin>
 <end></end>
 </RMarkDel>
 <!--
 or uncomment below to make deleted text disappear (well, become commented out)
 -->
 <!--
 <RMarkDel><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></RMarkDel>
 -->
 <!-- I don't even know what outline means -->
 <outline><begin></begin><end></end></outline>
 <smallcaps><begin></begin><end></end></smallcaps>
 <caps><begin></begin><end></end></caps>
 <vanish><begin></begin><end></end></vanish>
 <!--If you uncomment this then the annotation text links will become commented out by html tags-->
 <!--
 <vanish><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></vanish>
 -->
 <!--
 text that has been newly typed since the last time revision marks have been accepted
 and will be displayed with underline when revision marked text is to be displayed
 use either this line...
 -->
 <RMark><begin></begin><end></end></RMark>
 <!--
 or uncomment below to make the underline disappear
 -->
 <!--
 <RMark><begin></begin><end></end></RMark>
 -->
 <strike><begin></begin><end></end></strike>
 <shadow><begin></begin><end></end></shadow>
 <lowercase><begin></begin><end></end></lowercase>
 <emboss><begin></begin><end></end></emboss>
 <imprint><begin></begin><end></end></imprint>
 <!--double strike-->
 <dstrike><begin></begin><end></end></dstrike>
 <!--
 ftc's
 &
 hps
 keep them for font face and do that later.
 -->
 <super><begin></begin><end></end></super>
 <sub><begin></begin><end></end></sub>
 <singleu><begin></begin><end></end></singleu>
 <wordu><begin></begin><end></end></wordu>
 <doubleu><begin></begin><end></end></doubleu>
 <dottedu><begin></begin><end></end></dottedu>
 <hiddenu><begin></begin><end></end></hiddenu>
 <thicku><begin></begin><end></end></thicku>
 <dashu><begin></begin><end></end></dashu>
 <dotu><begin></begin><end></end></dotu>
 <dotdashu><begin></begin><end></end></dotdashu>
 <dotdotdashu><begin></begin><end></end></dotdotdashu>
 <waveu><begin></begin><end></end></waveu>
 <!--
 text whose properties have been changed since the last time revision marks have been accepted
 and will be displayed with a note showing the change points.
 use either this line (which is admittedly a bit scary looking, but harmless)...
 --> 
 <PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
 <!--
 or uncomment below to make the notes disappear
 -->
 <!--
 <PropRMark><begin></begin><end></end></PropRMark>
 -->
 <!--
 <color>
 -->
 <Black><begin></begin><end></end></Black>
 <Blue><begin></begin><end></end></Blue>
 <Cyan><begin></begin><end></end></Cyan>
 <Green><begin></begin><end></end></Green>
 <Magenta><begin></begin><end></end></Magenta>
 <Red><begin></begin><end></end></Red>
 <Yellow><begin></begin><end></end></Yellow>
 <White><begin></begin><end></end></White>
 <DkBlue><begin></begin><end></end></DkBlue>
 <DkCyan><begin></begin><end></end></DkCyan>
 <DkGreen><begin></begin><end></end></DkGreen>
 <DkMagenta><begin></begin><end></end></DkMagenta>
 <DkRed><begin></begin><end></end></DkRed>
 <DkYellow><begin></begin><end></end></DkYellow>
 <DkGray><begin></begin><end></end></DkGray>
 <LtGray><begin></begin><end></end></LtGray>
 <!--
 </color>
 -->
 <!--
 <animation>
 -->
 <LasVegas><begin></begin><end></end></LasVegas>
 <BackgroundBlink><begin></begin><end></end></BackgroundBlink>
 <SparkleText><begin></begin><end></end></SparkleText>
 <MarchingAnts><begin></begin><end></end></MarchingAnts>
 <MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
 <Shimmer><begin></begin><end></end></Shimmer>
 <!--
 </animation>
 -->
 <!--
 I don't understand what this one is, and I've never come across it;
 use this sample line (which is admittedly a bit scary looking, but harmless)...
 -->
 <DispFldRMark><begin></begin><end></end></DispFldRMark>
 <!--
 or uncomment below to ignore it, the previous might even crash wv ?
 -->
 <!--
 <DispFldRMark><begin></begin><end></end></DispFldRMark>
 -->
 <animation>
 <begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
 <end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
 </animation>
 <fontstr>
 <begin></begin>
 <end></end>
 </fontstr>
 <comment>
 <begin>
 </begin>
 <end>
 </end>
 </comment>
 <style name="Normal">
 <character>
 <begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
 <end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
 </character>
 <!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
 border-top: thin <bordertopstyle/> <bordertopcolor/>;
 border-left: thin <borderleftstyle/> <borderleftcolor/>;
 border-right: thin <borderrightstyle/> <borderrightcolor/>;
 border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
 -->
 <pmargin>
 <begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
 </pmargin>
 <pborder>
 <begin>
 <!--
 border: thin <borderleftstyle/> <borderleftcolor/>;
 border-top: thin <bordertopstyle/> <bordertopcolor/>;
 border-left: thin <borderleftstyle/> <borderleftcolor/>;
 border-right: thin <borderrightstyle/> <borderrightcolor/>;
 border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
 -->
 </begin>
 </pborder>
 <picture>
 <begin>
 </begin>
 <!-- images are lacking for now -->
 </picture>
 </style>
 <!-- we need to be able to override the character properties -->
 <!--
 <style name="Normal">
 <character>
 <begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
 <end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
 </character>
 <text>	
 <begin></begin>
 <end>
 </end>
 </text>
 </style>
 <style name="Heading 1">
 <character>
 <begin></begin>
 <end></end>
 </character>
 <text>	
 <begin></begin>
 <end>
 </end>
 </text>
 </style>
 -->
 </main>
--- a/text/doc.py
+++ b/text/doc.py
@ -0,0 +1,54 @@
 #
 #  Copyright (c) 2007 Helmut Merz helmutm@cy55.de
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 #
 """
 Searchable text support for Microsoft Word (.doc) files.
 This uses the wvWare command to perform the extraction.
 interface definitions for text transformations.
 Based on code provided by zc.index and TextIndexNG3.
 $Id$
 """
 import os, sys
 from cybertools.text import base
 # Determine the path of the wvText.xml configuration file in the
 # ``config`` subdirectory of this package; it is handed to wvWare via
 # the -x option.
 try:
    # use package_home() from the Globals module when it can be imported
    from Globals import package_home
    wvConf = os.path.join(package_home(globals()), 'config', 'wvText.xml')
 except ImportError:
    # otherwise derive the package directory from this module's file name
    wvConf = os.path.join(os.path.dirname(__file__), 'config', 'wvText.xml')
 class DocTransform(base.BaseFileTransform):
    """File transform that turns Word documents into plain text via wvWare."""
    extension = ".doc"
    def extract(self, directory, filename):
        """Return the text wvWare produces for ``filename`` as unicode.

        An empty unicode string is returned when ``checkAvailable()``
        reports that wvWare is missing.  ``directory`` is not used.
        """
        available = self.checkAvailable('wvWare', 'wvWare is not available')
        if not available:
            return u''
        # the two command lines differ only in where stderr is discarded
        if sys.platform == 'win32':
            template = 'wvWare -c utf-8 --nographics -x "%s" "%s" 2> nul:'
        else:
            template = 'wvWare -c utf-8 --nographics -x "%s" "%s" 2> /dev/null'
        output = self.execute(template % (wvConf, filename))
        return output.decode('UTF-8')
--- a/text/interfaces.py
+++ b/text/interfaces.py
@ -27,17 +27,32 @@ from zope.interface import Interface
 class ITextTransform(Interface):
-    def __call__(f):
+    def __call__(fr):
-        """ Transform the content of file f to plain text and return
+        """ Transform the content of file fr (readfile) to plain text and
-            the result as unicode.
+            return the result as unicode.
        """
 class IFileTransform(ITextTransform):
-    """ A transformation that uses an intermediate disk file.
+    """ A transformation that is performed by calling some external program
        and that typically uses an intermediate disk file.
    """
    def extract(dirname, filename):
-        """ Extract text contents from the file specified by dirnam, filename,
+        """ Extract text contents from the file specified by ``filename``,
             using some external program, and return the result as unicode.
            ``dirname`` is the path to a temporary directory that
            usually (but not necessarily) contains the file and may
            be used for creating other (temporary) files if needed.
        """
    def execute(command):
        """ Execute a system command and return the output of the program
            called.
        """
    def checkAvailable(progname, logMessage=''):
        """ Check the availability of the program named ``progname``.
            Return True if available; if ``logMessage`` is given, put this
            as a warning message into the log if the program is not available.
        """
--- a/text/pdf.py
+++ b/text/pdf.py
@ -1,9 +1,31 @@
-"""Searchable text support for Portable Document Format (PDF) files.
+#
-
+#  Copyright (c) 2007 Helmut Merz helmutm@cy55.de
-This uses the pdftotext command from xpdf to perform the extraction.
+#
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 #
 """
-__docformat__ = "reStructuredText"
+Searchable text support for Portable Document Format (PDF) files.
 This uses the pdftotext command from xpdf to perform the extraction.
 interface definitions for text transformations.
 Based on code provided by zc.index and TextIndexNG3.
 $Id$
 """
 import os, sys
@ -15,12 +37,7 @@ class PdfTransform(base.BaseFileTransform):
    extension = ".pdf"
    def extract(self, directory, filename):
        """Return the text of the PDF file ``filename`` as unicode.

        The text is produced by running ``pdftotext`` through
        ``self.execute()`` and decoding its output as UTF-8.  An empty
        unicode string is returned when ``checkAvailable()`` reports that
        pdftotext is missing.  ``directory`` is not used.
        """
        if not self.checkAvailable('pdftotext', 'pdftotext is not available'):
            # checkAvailable() has already logged the warning message, so
            # there is no additional print here
            return u''
        # "-" as output file name makes pdftotext write the text to stdout,
        # so no intermediate words.txt file is needed
        data = self.execute('pdftotext -enc UTF-8 "%s" -' % filename)
        return data.decode('UTF-8')
--- a/text/testfiles/mary.doc
+++ b/text/testfiles/mary.doc
--- a/text/testfiles/mary.odt
+++ b/text/testfiles/mary.odt