clean-up of cybertools.text; provide msword conversion
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1603 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
53ae9f332a
commit
b380b0fcd7
8 changed files with 529 additions and 44 deletions
|
@ -1,18 +1,56 @@
|
|||
=================================================
|
||||
Text transformations, e.g. for full-text indexing
|
||||
Text Transformations, e.g. for Full-text Indexing
|
||||
=================================================
|
||||
|
||||
($Id$)
|
||||
|
||||
If a converter program needed is not available we want to put a warning
|
||||
into Zope's server log; in order to be able to test this we register
|
||||
a log handler for testing:
|
||||
|
||||
>>> from zope.testing.loggingsupport import InstalledHandler
|
||||
>>> log = InstalledHandler('zope.server')
|
||||
|
||||
The test files are in a subdirectory of the text package:
|
||||
|
||||
>>> import os
|
||||
>>> from cybertools import text
|
||||
>>> directory = os.path.dirname(text.__file__)
|
||||
>>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
|
||||
>>> f = open(fn)
|
||||
>>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
|
||||
|
||||
PDF Files
|
||||
---------
|
||||
|
||||
Let's start with a PDF file:
|
||||
|
||||
>>> from cybertools.text.pdf import PdfTransform
|
||||
>>> transform = PdfTransform(None)
|
||||
>>> words = transform(f).split()
|
||||
>>> f = open(os.path.join(testdir, 'mary.pdf'))
|
||||
|
||||
This will be transformed to plain text:
|
||||
|
||||
>>> result = transform(f)
|
||||
|
||||
Let's check the log, should be empty:
|
||||
|
||||
>>> print log
|
||||
|
||||
So what is in the plain text result?
|
||||
|
||||
>>> words = result.split()
|
||||
>>> len(words)
|
||||
89
|
||||
>>> u'lamb' in words
|
||||
True
|
||||
|
||||
Word Documents
|
||||
--------------
|
||||
|
||||
>>> from cybertools.text.doc import DocTransform
|
||||
>>> transform = DocTransform(None)
|
||||
>>> f = open(os.path.join(testdir, 'mary.doc'))
|
||||
>>> result = transform(f)
|
||||
>>> print log
|
||||
>>> words = result.split()
|
||||
>>> len(words)
|
||||
89
|
||||
>>> u'lamb' in words
|
||||
|
|
50
text/base.py
50
text/base.py
|
@ -19,32 +19,17 @@
|
|||
"""
|
||||
Base classes for text transformations.
|
||||
|
||||
Based on code provided by zc.index.
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
|
||||
__docformat__ = "reStructuredText"
|
||||
|
||||
import os, shutil, sys, tempfile
|
||||
import logging
|
||||
from zope.interface import implements
|
||||
from cybertools.text.interfaces import ITextTransform, IFileTransform
|
||||
|
||||
def haveProgram(name):
    """Return true if the program `name` is available.

    Every directory listed in the ``PATH`` environment variable is
    searched for a regular file called `name`; on Windows platforms the
    suffixes ``.com``, ``.exe`` and ``.bat`` are appended in turn.
    A candidate only counts if the current user is allowed to execute it.
    """
    if sys.platform.lower().startswith("win"):
        extensions = (".com", ".exe", ".bat")
    else:
        extensions = ("",)
    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
    for path in execpath:
        for ext in extensions:
            fn = os.path.join(path, name + ext)
            # a file that merely carries the right name but may not be
            # executed would make the later program call fail
            if os.path.isfile(fn) and os.access(fn, os.X_OK):
                return True
    return False
|
||||
|
||||
|
||||
class BaseTransform(object):
|
||||
|
||||
|
@ -54,11 +39,9 @@ class BaseTransform(object):
|
|||
self.context = context
|
||||
self.text = None
|
||||
|
||||
def __call__(self, f):
|
||||
def __call__(self, fr):
|
||||
if self.text is None:
|
||||
fr = open(f, 'r')
|
||||
self.text = fr.read()
|
||||
fr.close()
|
||||
return self.text
|
||||
|
||||
|
||||
|
@ -66,22 +49,45 @@ class BaseFileTransform(BaseTransform):
|
|||
|
||||
implements(IFileTransform)
|
||||
|
||||
extension = '.txt'
|
||||
|
||||
def __call__(self, fr):
    """Return the text extracted from the open file object `fr`.

    The content of `fr` is copied to a file named ``temp`` plus
    ``self.extension`` in a freshly created temporary directory; this
    copy is handed to ``self.extract()``. The temporary directory is
    removed afterwards, also if the extraction raises an exception.

    The result is stored in ``self.text``; as long as that attribute is
    not None later calls return it unchanged without reading `fr`.
    `fr` itself is not closed here.
    """
    if self.text is None:
        dirname = tempfile.mkdtemp()
        try:
            filename = os.path.join(dirname, "temp" + self.extension)
            fw = open(filename, "wb")
            try:
                shutil.copyfileobj(fr, fw)
            finally:
                # release the handle even if copying fails, so that the
                # directory can always be removed below
                fw.close()
            self.text = self.extract(dirname, filename)
        finally:
            shutil.rmtree(dirname)
    return self.text
|
||||
|
||||
def extract(self, dirname, filename):
    """Placeholder for the extraction step; always raises ValueError.

    Subclasses override this method, receiving the temporary directory
    `dirname` and the path `filename` of the file to process.
    """
    message = 'Method extract() has to be implemented by subclass.'
    raise ValueError(message)
|
||||
|
||||
def execute(self, com):
    """Run the command line `com` and return what can be read from it.

    ``win32pipe.popen()`` is used if the ``win32pipe`` module can be
    imported, ``os.popen()`` otherwise. The pipe is closed again after
    reading, also if reading raises an exception.
    """
    try:
        import win32pipe
    except ImportError:
        pipe = os.popen(com)
    else:
        pipe = win32pipe.popen(com)
    try:
        return pipe.read()
    finally:
        # do not leave the pipe (and the child process) to the
        # garbage collector
        pipe.close()
|
||||
|
||||
def checkAvailable(self, name, logMessage=''):
    """Return True if the program `name` is found via ``PATH``.

    Every directory listed in the ``PATH`` environment variable is
    searched for a regular file called `name` that the current user may
    execute; on Windows platforms the suffixes ``.com``, ``.exe`` and
    ``.bat`` are appended in turn.

    If nothing is found False is returned; in this case a non-empty
    `logMessage` is written as a warning to the 'zope.server' logger.
    """
    if sys.platform.lower().startswith("win"):
        extensions = (".com", ".exe", ".bat")
    else:
        extensions = ("",)
    execpath = os.environ.get("PATH", "").split(os.path.pathsep)
    for path in execpath:
        for ext in extensions:
            fn = os.path.join(path, name + ext)
            # a file that merely carries the right name but may not be
            # executed would make the later program call fail
            if os.path.isfile(fn) and os.access(fn, os.X_OK):
                return True
    if logMessage:
        # warning() is the documented method; warn() is a deprecated alias
        logging.getLogger('zope.server').warning(logMessage)
    return False
|
||||
|
|
355
text/config/wvText.xml
Executable file
355
text/config/wvText.xml
Executable file
|
@ -0,0 +1,355 @@
|
|||
<main>
|
||||
<charentity>
|
||||
<begin>ABW</begin>
|
||||
</charentity>
|
||||
|
||||
<document>
|
||||
<begin>
|
||||
</begin>
|
||||
<end>
|
||||
</end>
|
||||
</document>
|
||||
|
||||
<section>
|
||||
<begin>
|
||||
</begin>
|
||||
<end>
|
||||
</end>
|
||||
</section>
|
||||
|
||||
<justification>
|
||||
<left></left>
|
||||
<right></right>
|
||||
<center></center>
|
||||
<block></block>
|
||||
<asian></asian>
|
||||
</justification>
|
||||
|
||||
<numbering>
|
||||
<Arabic>type="1"</Arabic>
|
||||
<UpperRoman>type="I"</UpperRoman>
|
||||
<LowerRoman>type="i"</LowerRoman>
|
||||
<UpperCaseN>type="A"</UpperCaseN>
|
||||
<LowerCaseN>type="a"</LowerCaseN>
|
||||
</numbering>
|
||||
|
||||
<border>
|
||||
<noned></noned>
|
||||
<singled></singled>
|
||||
<thickd></thickd>
|
||||
<doubled></doubled>
|
||||
<number4d></number4d>
|
||||
<hairlined></hairlined>
|
||||
<dotd></dotd>
|
||||
<dashlargegapd></dashlargegapd>
|
||||
<dotdashd></dotdashd>
|
||||
<dotdotdashd></dotdotdashd>
|
||||
<tripled></tripled>
|
||||
<thin-thicksmallgapd></thin-thicksmallgapd>
|
||||
<thick-thinsmallgapd></thick-thinsmallgapd>
|
||||
<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
|
||||
<thin-thickmediumgapd></thin-thickmediumgapd>
|
||||
<thick-thinmediumgapd></thick-thinmediumgapd>
|
||||
<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
|
||||
<thin-thicklargegapd></thin-thicklargegapd>
|
||||
<thick-thinlargegapd></thick-thinlargegapd>
|
||||
<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
|
||||
<waved></waved>
|
||||
<doublewaved></doublewaved>
|
||||
<dashsmallgapd></dashsmallgapd>
|
||||
<dashdotstrokedd></dashdotstrokedd>
|
||||
<emboss3Dd></emboss3Dd>
|
||||
<engrave3Dd></engrave3Dd>
|
||||
<defaultd></defaultd>
|
||||
</border>
|
||||
|
||||
<olist>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</olist>
|
||||
|
||||
<ulist>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</ulist>
|
||||
|
||||
<entry>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</entry>
|
||||
|
||||
<!-- the only thing of significance -->
|
||||
<text>
|
||||
<begin></begin>
|
||||
<end>
|
||||
</end>
|
||||
</text>
|
||||
|
||||
<!--
|
||||
this tableoverride option can be used to turn off handling of
|
||||
these tags in tables, which I find is necessary for at least netscape
|
||||
-->
|
||||
<tableoverrides>
|
||||
<ParaBefore>0</ParaBefore>
|
||||
<ParaRight>0</ParaRight>
|
||||
<ParaAfter>0</ParaAfter>
|
||||
<ParaLeft>0</ParaLeft>
|
||||
<ParaLeft1>0</ParaLeft1>
|
||||
<VertMergedCells>0</VertMergedCells>
|
||||
</tableoverrides>
|
||||
|
||||
<table>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</table>
|
||||
|
||||
<row>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</row>
|
||||
|
||||
<cell>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</cell>
|
||||
|
||||
<paragraph>
|
||||
<begin><text.begin/></begin>
|
||||
<end><text.end/></end>
|
||||
</paragraph>
|
||||
|
||||
<!-- these are all the character properties that can show up in word -->
|
||||
<bold><begin></begin><end></end></bold>
|
||||
<italic><begin></begin><end></end></italic>
|
||||
|
||||
<!--
|
||||
text that has been deleted and will be displayed with strikethrough when
|
||||
revision marked text is to be displayed
|
||||
|
||||
use either this line...
|
||||
-->
|
||||
<RMarkDel><begin></begin>
|
||||
<end></end>
|
||||
</RMarkDel>
|
||||
|
||||
<!--
|
||||
or uncomment below to make deleted text disappear (well, become commented out)
|
||||
-->
|
||||
<!--
|
||||
<RMarkDel><begin><!--</begin><end>--></end></RMarkDel>
|
||||
-->
|
||||
|
||||
<!-- I don't even know what outline means -->
|
||||
<outline><begin></begin><end></end></outline>
|
||||
<smallcaps><begin></begin><end></end></smallcaps>
|
||||
<caps><begin></begin><end></end></caps>
|
||||
<vanish><begin></begin><end></end></vanish>
|
||||
|
||||
<!--If you uncomment this then the annotation text links will become commented out by html tags-->
|
||||
<!--
|
||||
<vanish><begin><!--</begin><end>--></end></vanish>
|
||||
-->
|
||||
|
||||
<!--
|
||||
text that has been newly typed since the last time revision marks have been accepted
|
||||
and will be displayed with underline when revision marked text is to be displayed
|
||||
|
||||
use either this line...
|
||||
-->
|
||||
<RMark><begin></begin><end></end></RMark>
|
||||
|
||||
<!--
|
||||
or uncomment below to make the underline disappear
|
||||
-->
|
||||
<!--
|
||||
<RMark><begin></begin><end></end></RMark>
|
||||
-->
|
||||
|
||||
|
||||
<strike><begin></begin><end></end></strike>
|
||||
<shadow><begin></begin><end></end></shadow>
|
||||
<lowercase><begin></begin><end></end></lowercase>
|
||||
<emboss><begin></begin><end></end></emboss>
|
||||
<imprint><begin></begin><end></end></imprint>
|
||||
<!--double strike-->
|
||||
<dstrike><begin></begin><end></end></dstrike>
|
||||
|
||||
<!--
|
||||
ftc's
|
||||
&
|
||||
hps
|
||||
|
||||
keep them for font face and do that later.
|
||||
-->
|
||||
|
||||
<super><begin></begin><end></end></super>
|
||||
<sub><begin></begin><end></end></sub>
|
||||
|
||||
<singleu><begin></begin><end></end></singleu>
|
||||
<wordu><begin></begin><end></end></wordu>
|
||||
<doubleu><begin></begin><end></end></doubleu>
|
||||
<dottedu><begin></begin><end></end></dottedu>
|
||||
<hiddenu><begin></begin><end></end></hiddenu>
|
||||
<thicku><begin></begin><end></end></thicku>
|
||||
<dashu><begin></begin><end></end></dashu>
|
||||
<dotu><begin></begin><end></end></dotu>
|
||||
<dotdashu><begin></begin><end></end></dotdashu>
|
||||
<dotdotdashu><begin></begin><end></end></dotdotdashu>
|
||||
<waveu><begin></begin><end></end></waveu>
|
||||
|
||||
<!--
|
||||
text whose properties have been changed since the last time revision marks have been accepted
|
||||
and will be displayed with a note showing the change points.
|
||||
|
||||
use either this line (which is, admittedly, a bit scary looking, but harmless)...
|
||||
-->
|
||||
<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
|
||||
|
||||
<!--
|
||||
or uncomment below to make the notes disappear
|
||||
-->
|
||||
<!--
|
||||
<PropRMark><begin></begin><end></end></PropRMark>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<color>
|
||||
-->
|
||||
<Black><begin></begin><end></end></Black>
|
||||
<Blue><begin></begin><end></end></Blue>
|
||||
<Cyan><begin></begin><end></end></Cyan>
|
||||
<Green><begin></begin><end></end></Green>
|
||||
<Magenta><begin></begin><end></end></Magenta>
|
||||
<Red><begin></begin><end></end></Red>
|
||||
<Yellow><begin></begin><end></end></Yellow>
|
||||
<White><begin></begin><end></end></White>
|
||||
<DkBlue><begin></begin><end></end></DkBlue>
|
||||
<DkCyan><begin></begin><end></end></DkCyan>
|
||||
<DkGreen><begin></begin><end></end></DkGreen>
|
||||
<DkMagenta><begin></begin><end></end></DkMagenta>
|
||||
<DkRed><begin></begin><end></end></DkRed>
|
||||
<DkYellow><begin></begin><end></end></DkYellow>
|
||||
<DkGray><begin></begin><end></end></DkGray>
|
||||
<LtGray><begin></begin><end></end></LtGray>
|
||||
<!--
|
||||
</color>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<animation>
|
||||
-->
|
||||
<LasVegas><begin></begin><end></end></LasVegas>
|
||||
<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
|
||||
<SparkleText><begin></begin><end></end></SparkleText>
|
||||
<MarchingAnts><begin></begin><end></end></MarchingAnts>
|
||||
<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
|
||||
<Shimmer><begin></begin><end></end></Shimmer>
|
||||
<!--
|
||||
</animation>
|
||||
-->
|
||||
|
||||
<!--
|
||||
I don't understand what this one is, and I've never come across it
|
||||
|
||||
use this sample line (which is, admittedly, a bit scary looking, but harmless)...
|
||||
-->
|
||||
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
||||
|
||||
<!--
|
||||
or uncomment below to ignore it, the previous might even crash wv ?
|
||||
-->
|
||||
<!--
|
||||
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
||||
-->
|
||||
|
||||
<animation>
|
||||
<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
|
||||
<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
|
||||
</animation>
|
||||
|
||||
<fontstr>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</fontstr>
|
||||
|
||||
<comment>
|
||||
<begin>
|
||||
</begin>
|
||||
<end>
|
||||
</end>
|
||||
</comment>
|
||||
|
||||
<style name="Normal">
|
||||
<character>
|
||||
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
||||
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
||||
</character>
|
||||
|
||||
<!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
|
||||
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
||||
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
||||
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
||||
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
||||
-->
|
||||
|
||||
|
||||
<pmargin>
|
||||
<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
|
||||
</pmargin>
|
||||
|
||||
<pborder>
|
||||
<begin>
|
||||
<!--
|
||||
border: thin <borderleftstyle/> <borderleftcolor/>;
|
||||
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
||||
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
||||
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
||||
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
||||
-->
|
||||
</begin>
|
||||
</pborder>
|
||||
|
||||
<picture>
|
||||
<begin>
|
||||
</begin>
|
||||
<!-- images are lacking for now -->
|
||||
|
||||
</picture>
|
||||
|
||||
</style>
|
||||
|
||||
<!--we need to be able to override the character properties-->
|
||||
<!--
|
||||
<style name="Normal">
|
||||
<character>
|
||||
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
||||
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
||||
</character>
|
||||
|
||||
<text>
|
||||
<begin></begin>
|
||||
<end>
|
||||
</end>
|
||||
</text>
|
||||
|
||||
</style>
|
||||
|
||||
<style name="Heading 1">
|
||||
|
||||
<character>
|
||||
<begin></begin>
|
||||
<end></end>
|
||||
</character>
|
||||
|
||||
<text>
|
||||
<begin></begin>
|
||||
<end>
|
||||
</end>
|
||||
</text>
|
||||
|
||||
|
||||
|
||||
</style>
|
||||
-->
|
||||
|
||||
</main>
|
54
text/doc.py
Normal file
54
text/doc.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Searchable text support for Microsoft Word (.doc) files.
|
||||
|
||||
This uses the wvWare command to perform the extraction.
|
||||
interface definitions for text transformations.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
import os, sys
|
||||
|
||||
from cybertools.text import base
|
||||
|
||||
try:
|
||||
from Globals import package_home
|
||||
wvConf = os.path.join(package_home(globals()), 'config', 'wvText.xml')
|
||||
except ImportError:
|
||||
wvConf = os.path.join(os.path.dirname(__file__), 'config', 'wvText.xml')
|
||||
|
||||
|
||||
class DocTransform(base.BaseFileTransform):
    """ Transformation of Word documents (``.doc``) to plain text,
        performed by calling the wvWare program.
    """

    extension = ".doc"

    def extract(self, directory, filename):
        """ Return the text of the Word file ``filename`` as unicode.

            An empty unicode string is returned if wvWare is not
            available; ``directory`` is not used here.
        """
        available = self.checkAvailable('wvWare', 'wvWare is not available')
        if not available:
            return u''
        # the converter's error output is sent to the null device
        # of the platform
        if sys.platform == 'win32':
            command = 'wvWare -c utf-8 --nographics -x "%s" "%s" 2> nul:'
        else:
            command = 'wvWare -c utf-8 --nographics -x "%s" "%s" 2> /dev/null'
        output = self.execute(command % (wvConf, filename))
        return output.decode('UTF-8')
|
|
@ -27,17 +27,32 @@ from zope.interface import Interface
|
|||
|
||||
class ITextTransform(Interface):
|
||||
|
||||
def __call__(f):
|
||||
""" Transform the content of file f to plain text and return
|
||||
the result as unicode.
|
||||
def __call__(fr):
|
||||
""" Transform the content of file fr (readfile) to plain text and
|
||||
return the result as unicode.
|
||||
"""
|
||||
|
||||
|
||||
class IFileTransform(ITextTransform):
|
||||
""" A transformation that uses an intermediate disk file.
|
||||
""" A transformation that is performed by calling some external program
|
||||
and that typically uses an intermediate disk file.
|
||||
"""
|
||||
|
||||
def extract(dirname, filename):
|
||||
""" Extract text contents from the file specified by dirnam, filename,
|
||||
""" Extract text contents from the file specified by ``filename``,
|
||||
using some external program, and return the result as unicode.
|
||||
``dirname`` is the path to a temporary directory that
|
||||
usually (but not necessarily) contains the file and may
|
||||
be used for creating other (temporary) files if needed.
|
||||
"""
|
||||
|
||||
def execute(command):
|
||||
""" Execute a system command and return the output of the program
|
||||
called.
|
||||
"""
|
||||
|
||||
def checkAvailable(progname, logMessage=''):
|
||||
""" Check the availability of the program named ``progname``.
|
||||
Return True if available; if ``logMessage`` is given, put this
|
||||
as a warning message into the log if the program is not available.
|
||||
"""
|
||||
|
|
41
text/pdf.py
41
text/pdf.py
|
@ -1,9 +1,31 @@
|
|||
"""Searchable text support for Portable Document Format (PDF) files.
|
||||
|
||||
This uses the pdftotext command from xpdf to perform the extraction.
|
||||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
__docformat__ = "reStructuredText"
|
||||
Searchable text support for Portable Document Format (PDF) files.
|
||||
|
||||
This uses the pdftotext command from xpdf to perform the extraction.
|
||||
interface definitions for text transformations.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
import os, sys
|
||||
|
||||
|
@ -15,12 +37,7 @@ class PdfTransform(base.BaseFileTransform):
|
|||
extension = ".pdf"
|
||||
|
||||
def extract(self, directory, filename):
|
||||
if not base.haveProgram("pdftotext"):
|
||||
print 'Warning: pdftotext is not available'
|
||||
if not self.checkAvailable('pdftotext', 'pdftotext is not available'):
|
||||
return u''
|
||||
txtfile = os.path.join(directory, "words.txt")
|
||||
st = os.system("pdftotext -enc UTF-8 %s %s" % (filename, txtfile))
|
||||
f = open(txtfile, "rb")
|
||||
data = f.read()
|
||||
f.close()
|
||||
return unicode(data, "utf-8")
|
||||
data = self.execute('pdftotext -enc UTF-8 "%s" -' % filename)
|
||||
return data.decode('UTF-8')
|
||||
|
|
BIN
text/testfiles/mary.doc
Normal file
BIN
text/testfiles/mary.doc
Normal file
Binary file not shown.
BIN
text/testfiles/mary.odt
Normal file
BIN
text/testfiles/mary.odt
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue