clean-up of cybertools.text; provide msword conversion

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1603 fd906abe-77d9-0310-91a1-e0d9ade77398
2007-03-01 11:40:35 +00:00 · 2007-03-01 11:40:35 +00:00 · b380b0fcd7
commit b380b0fcd7
parent 53ae9f332a
8 changed files with 529 additions and 44 deletions
--- a/text/README.txt
+++ b/text/README.txt
@ -1,18 +1,56 @@
 =================================================
-Text transformations, e.g. for full-text indexing
+Text Transformations, e.g. for Full-text Indexing
 =================================================

  ($Id$)

+If a required converter program is not available, we want to put a warning
+into Zope's server log; in order to be able to test this, we register
+a log handler for testing:
+
+  >>> from zope.testing.loggingsupport import InstalledHandler
+  >>> log = InstalledHandler('zope.server')
+
+The test files are in a subdirectory of the text package:
+
  >>> import os
  >>> from cybertools import text
-  >>> directory = os.path.dirname(text.__file__)
-  >>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
-  >>> f = open(fn)
+  >>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
+
+PDF Files
+---------
+
+Let's start with a PDF file:

  >>> from cybertools.text.pdf import PdfTransform
  >>> transform = PdfTransform(None)
-  >>> words = transform(f).split()
+  >>> f = open(os.path.join(testdir, 'mary.pdf'))
+
+This will be transformed to plain text:
+
+  >>> result = transform(f)
+
+Let's check the log; it should be empty:
+
+  >>> print log
+
+So what is in the plain text result?
+
+  >>> words = result.split()
+  >>> len(words)
+  89
+  >>> u'lamb' in words
+  True
+
+Word Documents
+--------------
+
+  >>> from cybertools.text.doc import DocTransform
+  >>> transform = DocTransform(None)
+  >>> f = open(os.path.join(testdir, 'mary.doc'))
+  >>> result = transform(f)
+  >>> print log
+  >>> words = result.split()
  >>> len(words)
  89
  >>> u'lamb' in words
--- a/text/base.py
+++ b/text/base.py
@ -19,32 +19,17 @@
 """
 Base classes for text transformations.

-Based on code provided by zc.index.
+Based on code provided by zc.index and TextIndexNG3.

 $Id$
 """


-__docformat__ = "reStructuredText"
-
 import os, shutil, sys, tempfile
+import logging
 from zope.interface import implements
 from cybertools.text.interfaces import ITextTransform, IFileTransform

-def haveProgram(name):
-    """Return true if the program `name` is available."""
-    if sys.platform.lower().startswith("win"):
-        extensions = (".com", ".exe", ".bat")
-    else:
-        extensions = ("",)
-    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
-    for path in execpath:
-        for ext in extensions:
-            fn = os.path.join(path, name + ext)
-            if os.path.isfile(fn):
-                return True
-    return False
-

 class BaseTransform(object):

@ -54,11 +39,9 @@ class BaseTransform(object):
        self.context = context
        self.text = None

-    def __call__(self, f):
+    def __call__(self, fr):
        if self.text is None:
-            fr = open(f, 'r')
            self.text = fr.read()
-            fr.close()
        return self.text


@ -66,22 +49,45 @@ class BaseFileTransform(BaseTransform):

    implements(IFileTransform)

+    extension = '.txt'
+
    def __call__(self, fr):
        if self.text is None:
-            #fr = f.open("rb")
            dirname = tempfile.mkdtemp()
            filename = os.path.join(dirname, "temp" + self.extension)
            try:
                fw = open(filename, "wb")
                shutil.copyfileobj(fr, fw)
-                #fr.close()
                fw.close()
                text = self.extract(dirname, filename)
            finally:
                shutil.rmtree(dirname)
+                #fr.close()
            self.text = text
        return self.text

    def extract(self, dirname, filename):
        raise ValueError('Method extract() has to be implemented by subclass.')

+    def execute(self, com):
+        try:
+            import win32pipe
+            result = win32pipe.popen(com).read()
+        except ImportError:
+            result = os.popen(com).read()
+        return result
+
+    def checkAvailable(self, name, logMessage=''):
+        if sys.platform.lower().startswith("win"):
+            extensions = (".com", ".exe", ".bat")
+        else:
+            extensions = ("",)
+        execpath = os.environ.get("PATH", "").split(os.path.pathsep)
+        for path in execpath:
+            for ext in extensions:
+                fn = os.path.join(path, name + ext)
+                if os.path.isfile(fn):
+                    return True
+        if logMessage:
+            logging.getLogger('zope.server').warn(logMessage)
+        return False
--- a/text/config/wvText.xml
+++ b/text/config/wvText.xml
@ -0,0 +1,355 @@
+<main>
+<charentity>
+<begin>ABW</begin>
+</charentity>
+
+<document>
+<begin>
+</begin>
+<end>
+</end>
+</document>
+
+<section>
+<begin>
+</begin>
+<end>
+</end>
+</section>
+
+<justification>
+<left></left>
+<right></right>
+<center></center>
+<block></block>
+<asian></asian>
+</justification>
+
+<numbering>
+<Arabic>type=&quot;1&quot;</Arabic>
+<UpperRoman>type=&quot;I&quot;</UpperRoman>
+<LowerRoman>type=&quot;i&quot;</LowerRoman>
+<UpperCaseN>type=&quot;A&quot;</UpperCaseN>
+<LowerCaseN>type=&quot;a&quot;</LowerCaseN>
+</numbering>
+
+<border>
+<noned></noned>
+<singled></singled>
+<thickd></thickd>
+<doubled></doubled>
+<number4d></number4d>
+<hairlined></hairlined>
+<dotd></dotd>
+<dashlargegapd></dashlargegapd>
+<dotdashd></dotdashd>
+<dotdotdashd></dotdotdashd>
+<tripled></tripled>
+<thin-thicksmallgapd></thin-thicksmallgapd>
+<thick-thinsmallgapd></thick-thinsmallgapd>
+<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
+<thin-thickmediumgapd></thin-thickmediumgapd>
+<thick-thinmediumgapd></thick-thinmediumgapd>
+<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
+<thin-thicklargegapd></thin-thicklargegapd>
+<thick-thinlargegapd></thick-thinlargegapd>
+<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
+<waved></waved>
+<doublewaved></doublewaved>
+<dashsmallgapd></dashsmallgapd>
+<dashdotstrokedd></dashdotstrokedd>
+<emboss3Dd></emboss3Dd>
+<engrave3Dd></engrave3Dd>
+<defaultd></defaultd>
+</border>
+
+<olist>
+<begin></begin>
+<end></end>
+</olist>
+
+<ulist>
+<begin></begin>
+<end></end>
+</ulist>
+
+<entry>
+<begin></begin>
+<end></end>
+</entry>
+
+<!-- the only thing of significance -->
+<text>	
+<begin></begin>
+<end>
+</end>
+</text>
+
+<!-- 
+this tableoverride option can be used to turn off handling of
+these tags in tables, which I find is necessary for at least netscape
+-->
+<tableoverrides>
+<ParaBefore>0</ParaBefore>
+<ParaRight>0</ParaRight>
+<ParaAfter>0</ParaAfter>
+<ParaLeft>0</ParaLeft>
+<ParaLeft1>0</ParaLeft1>
+<VertMergedCells>0</VertMergedCells>
+</tableoverrides>
+
+<table>
+<begin></begin>
+<end></end>
+</table>
+
+<row>
+<begin></begin>
+<end></end>
+</row>
+
+<cell>
+<begin></begin>
+<end></end>
+</cell>
+
+<paragraph>
+<begin><text.begin/></begin>
+<end><text.end/></end>
+</paragraph>
+
+<!-- these are all the character properties that can show up in word -->
+<bold><begin></begin><end></end></bold>
+<italic><begin></begin><end></end></italic>
+
+<!--
+text that has been deleted and will be displayed with strikethrough when
+revision marked text is to be displayed
+
+use either this line...
+--> 
+<RMarkDel><begin></begin>
+<end></end>
+</RMarkDel>
+
+<!--
+or uncomment below to make deleted text disappear (well, become commented out)
+-->
+<!--
+<RMarkDel><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></RMarkDel>
+-->
+
+<!-- I don't even know what outline means -->
+<outline><begin></begin><end></end></outline>
+<smallcaps><begin></begin><end></end></smallcaps>
+<caps><begin></begin><end></end></caps>
+<vanish><begin></begin><end></end></vanish>
+
+<!--If you uncomment this then the annotation text links will become commented out by html tags-->
+<!--
+<vanish><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></vanish>
+-->
+
+<!--
+text that has been newly typed since the last time revision marks have been accepted
+and will be displayed with underline when revision marked text is to be displayed
+
+use either this line...
+-->
+<RMark><begin></begin><end></end></RMark>
+
+<!--
+or uncomment below to make the underline disappear
+-->
+<!--
+<RMark><begin></begin><end></end></RMark>
+-->
+
+
+<strike><begin></begin><end></end></strike>
+<shadow><begin></begin><end></end></shadow>
+<lowercase><begin></begin><end></end></lowercase>
+<emboss><begin></begin><end></end></emboss>
+<imprint><begin></begin><end></end></imprint>
+<!--double strike-->
+<dstrike><begin></begin><end></end></dstrike>
+
+<!--
+ftc's
+&
+hps
+
+keep them for font face and do that later.
+-->
+
+<super><begin></begin><end></end></super>
+<sub><begin></begin><end></end></sub>
+
+<singleu><begin></begin><end></end></singleu>
+<wordu><begin></begin><end></end></wordu>
+<doubleu><begin></begin><end></end></doubleu>
+<dottedu><begin></begin><end></end></dottedu>
+<hiddenu><begin></begin><end></end></hiddenu>
+<thicku><begin></begin><end></end></thicku>
+<dashu><begin></begin><end></end></dashu>
+<dotu><begin></begin><end></end></dotu>
+<dotdashu><begin></begin><end></end></dotdashu>
+<dotdotdashu><begin></begin><end></end></dotdotdashu>
+<waveu><begin></begin><end></end></waveu>
+
+<!--
+text whose properties have been changed since the last time revision marks have been accepted
+and will be displayed with a note showing the change points.
+
+use either this line (which is admittedly a bit scary looking, but harmless)...
+--> 
+<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
+
+<!--
+or uncomment below to make the notes disappear
+-->
+<!--
+<PropRMark><begin></begin><end></end></PropRMark>
+-->
+
+<!--
+<color>
+-->
+<Black><begin></begin><end></end></Black>
+<Blue><begin></begin><end></end></Blue>
+<Cyan><begin></begin><end></end></Cyan>
+<Green><begin></begin><end></end></Green>
+<Magenta><begin></begin><end></end></Magenta>
+<Red><begin></begin><end></end></Red>
+<Yellow><begin></begin><end></end></Yellow>
+<White><begin></begin><end></end></White>
+<DkBlue><begin></begin><end></end></DkBlue>
+<DkCyan><begin></begin><end></end></DkCyan>
+<DkGreen><begin></begin><end></end></DkGreen>
+<DkMagenta><begin></begin><end></end></DkMagenta>
+<DkRed><begin></begin><end></end></DkRed>
+<DkYellow><begin></begin><end></end></DkYellow>
+<DkGray><begin></begin><end></end></DkGray>
+<LtGray><begin></begin><end></end></LtGray>
+<!--
+</color>
+-->
+
+<!--
+<animation>
+-->
+<LasVegas><begin></begin><end></end></LasVegas>
+<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
+<SparkleText><begin></begin><end></end></SparkleText>
+<MarchingAnts><begin></begin><end></end></MarchingAnts>
+<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
+<Shimmer><begin></begin><end></end></Shimmer>
+<!--
+</animation>
+-->
+
+<!--
+I don't understand what this one is, and I've never come across it
+
+use this sample line (which is admittedly a bit scary looking, but harmless)...
+-->
+<DispFldRMark><begin></begin><end></end></DispFldRMark>
+
+<!--
+or uncomment below to ignore it, the previous might even crash wv ?
+-->
+<!--
+<DispFldRMark><begin></begin><end></end></DispFldRMark>
+-->
+
+<animation>
+<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
+<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
+</animation>
+
+<fontstr>
+<begin></begin>
+<end></end>
+</fontstr>
+
+<comment>
+<begin>
+</begin>
+<end>
+</end>
+</comment>
+
+<style name="Normal">
+<character>
+<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
+<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
+</character>
+
+<!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
+border-top: thin <bordertopstyle/> <bordertopcolor/>;
+border-left: thin <borderleftstyle/> <borderleftcolor/>;
+border-right: thin <borderrightstyle/> <borderrightcolor/>;
+border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
+-->
+
+
+<pmargin>
+<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
+</pmargin>
+
+<pborder>
+<begin>
+<!--
+border: thin <borderleftstyle/> <borderleftcolor/>;
+border-top: thin <bordertopstyle/> <bordertopcolor/>;
+border-left: thin <borderleftstyle/> <borderleftcolor/>;
+border-right: thin <borderrightstyle/> <borderrightcolor/>;
+border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
+-->
+</begin>
+</pborder>
+
+<picture>
+<begin>
+</begin>
+<!-- images are lacking for now -->
+
+</picture>
+
+</style>
+
+<!--we need to be able to override the character properties-->
+<!--
+<style name="Normal">
+<character>
+<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
+<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
+</character>
+
+<text>	
+<begin></begin>
+<end>
+</end>
+</text>
+
+</style>
+
+<style name="Heading 1">
+
+<character>
+<begin></begin>
+<end></end>
+</character>
+
+<text>	
+<begin></begin>
+<end>
+</end>
+</text>
+
+
+
+</style>
+-->
+
+</main>
--- a/text/doc.py
+++ b/text/doc.py
@ -0,0 +1,54 @@
+#
+#  Copyright (c) 2007 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+"""
+Searchable text support for Microsoft Word (.doc) files.
+
+This uses the wvWare command, together with the wvText.xml configuration
+file from the config directory, to perform the extraction.
+
+Based on code provided by zc.index and TextIndexNG3.
+
+$Id$
+"""
+
+import os, sys
+
+from cybertools.text import base
+
+try:
+    from Globals import package_home
+    wvConf = os.path.join(package_home(globals()), 'config', 'wvText.xml')
+except ImportError:
+    wvConf = os.path.join(os.path.dirname(__file__), 'config', 'wvText.xml')
+
+
+class DocTransform(base.BaseFileTransform):
+
+    extension = ".doc"
+
+    def extract(self, directory, filename):
+        if not self.checkAvailable('wvWare', 'wvWare is not available'):
+            return u''
+        if sys.platform == 'win32':
+            data = self.execute('wvWare -c utf-8 --nographics -x "%s" "%s" 2> nul:'
+                                % (wvConf, filename))
+        else:
+            data = self.execute('wvWare -c utf-8 --nographics -x "%s" "%s" 2> /dev/null'
+                                % (wvConf, filename))
+        return data.decode('UTF-8')
--- a/text/interfaces.py
+++ b/text/interfaces.py
@ -27,17 +27,32 @@ from zope.interface import Interface

 class ITextTransform(Interface):

-    def __call__(f):
-        """ Transform the content of file f to plain text and return
-            the result as unicode.
+    def __call__(fr):
+        """ Transform the content of file fr (readfile) to plain text and
+            return the result as unicode.
        """


 class IFileTransform(ITextTransform):
-    """ A transformation that uses an intermediate disk file.
+    """ A transformation that is performed by calling some external program
+        and that typically uses an intermediate disk file.
    """

    def extract(dirname, filename):
-        """ Extract text contents from the file specified by dirnam, filename,
+        """ Extract text contents from the file specified by ``filename``,
            using some external programm, and return the result as unicode.
+            ``dirname`` is the path to a temporary directory that
+            usually (but not necessarily) contains the file and may
+            be used for creating other (temporary) files if needed.
+        """
+
+    def execute(command):
+        """ Execute a system command and return the output of the program
+            called.
+        """
+
+    def checkAvailable(progname, logMessage=''):
+        """ Check the availability of the program named ``progname``.
+            Return True if available; if ``logMessage`` is given, put this
+            as a warning message into the log if the program is not available.
        """
--- a/text/pdf.py
+++ b/text/pdf.py
@ -1,9 +1,31 @@
-"""Searchable text support for Portable Document Format (PDF) files.
-
-This uses the pdftotext command from xpdf to perform the extraction.
+#
+#  Copyright (c) 2007 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#

 """
-__docformat__ = "reStructuredText"
+Searchable text support for Portable Document Format (PDF) files.
+
+This uses the pdftotext command from xpdf to perform the extraction;
+the extracted text is decoded from UTF-8 and returned as unicode.
+
+Based on code provided by zc.index and TextIndexNG3.
+
+$Id$
+"""

 import os, sys

@ -15,12 +37,7 @@ class PdfTransform(base.BaseFileTransform):
    extension = ".pdf"

    def extract(self, directory, filename):
-        if not base.haveProgram("pdftotext"):
-            print 'Warning: pdftotext is not available'
+        if not self.checkAvailable('pdftotext', 'pdftotext is not available'):
            return u''
-        txtfile = os.path.join(directory, "words.txt")
-        st = os.system("pdftotext -enc UTF-8 %s %s" % (filename, txtfile))
-        f = open(txtfile, "rb")
-        data = f.read()
-        f.close()
-        return unicode(data, "utf-8")
+        data = self.execute('pdftotext -enc UTF-8 "%s" -' % filename)
+        return data.decode('UTF-8')
--- a/text/testfiles/mary.doc
+++ b/text/testfiles/mary.doc
--- a/text/testfiles/mary.odt
+++ b/text/testfiles/mary.odt