clean-up of cybertools.text; provide msword conversion
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1603 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
53ae9f332a
commit
b380b0fcd7
8 changed files with 529 additions and 44 deletions
|
@ -1,18 +1,56 @@
|
||||||
=================================================
|
=================================================
|
||||||
Text transformations, e.g. for full-text indexing
|
Text Transformations, e.g. for Full-text Indexing
|
||||||
=================================================
|
=================================================
|
||||||
|
|
||||||
($Id$)
|
($Id$)
|
||||||
|
|
||||||
|
If a converter program needed is not available we want to put a warning
|
||||||
|
into Zope's server log; in order to be able to test this we register
|
||||||
|
a log handler for testing:
|
||||||
|
|
||||||
|
>>> from zope.testing.loggingsupport import InstalledHandler
|
||||||
|
>>> log = InstalledHandler('zope.server')
|
||||||
|
|
||||||
|
The test files are in a subdirectory of the text package:
|
||||||
|
|
||||||
>>> import os
|
>>> import os
|
||||||
>>> from cybertools import text
|
>>> from cybertools import text
|
||||||
>>> directory = os.path.dirname(text.__file__)
|
>>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
|
||||||
>>> fn = os.path.sep.join((directory, 'testfiles', 'mary.pdf'))
|
|
||||||
>>> f = open(fn)
|
PDF Files
|
||||||
|
---------
|
||||||
|
|
||||||
|
Let's start with a PDF file:
|
||||||
|
|
||||||
>>> from cybertools.text.pdf import PdfTransform
|
>>> from cybertools.text.pdf import PdfTransform
|
||||||
>>> transform = PdfTransform(None)
|
>>> transform = PdfTransform(None)
|
||||||
>>> words = transform(f).split()
|
>>> f = open(os.path.join(testdir, 'mary.pdf'))
|
||||||
|
|
||||||
|
This will be transformed to plain text:
|
||||||
|
|
||||||
|
>>> result = transform(f)
|
||||||
|
|
||||||
|
Let's check the log; it should be empty:
|
||||||
|
|
||||||
|
>>> print log
|
||||||
|
|
||||||
|
So what is in the plain text result?
|
||||||
|
|
||||||
|
>>> words = result.split()
|
||||||
|
>>> len(words)
|
||||||
|
89
|
||||||
|
>>> u'lamb' in words
|
||||||
|
True
|
||||||
|
|
||||||
|
Word Documents
|
||||||
|
--------------
|
||||||
|
|
||||||
|
>>> from cybertools.text.doc import DocTransform
|
||||||
|
>>> transform = DocTransform(None)
|
||||||
|
>>> f = open(os.path.join(testdir, 'mary.doc'))
|
||||||
|
>>> result = transform(f)
|
||||||
|
>>> print log
|
||||||
|
>>> words = result.split()
|
||||||
>>> len(words)
|
>>> len(words)
|
||||||
89
|
89
|
||||||
>>> u'lamb' in words
|
>>> u'lamb' in words
|
||||||
|
|
50
text/base.py
50
text/base.py
|
@ -19,32 +19,17 @@
|
||||||
"""
|
"""
|
||||||
Base classes for text transformations.
|
Base classes for text transformations.
|
||||||
|
|
||||||
Based on code provided by zc.index.
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
$Id$
|
$Id$
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
__docformat__ = "reStructuredText"
|
|
||||||
|
|
||||||
import os, shutil, sys, tempfile
|
import os, shutil, sys, tempfile
|
||||||
|
import logging
|
||||||
from zope.interface import implements
|
from zope.interface import implements
|
||||||
from cybertools.text.interfaces import ITextTransform, IFileTransform
|
from cybertools.text.interfaces import ITextTransform, IFileTransform
|
||||||
|
|
||||||
def haveProgram(name):
|
|
||||||
"""Return true if the program `name` is available."""
|
|
||||||
if sys.platform.lower().startswith("win"):
|
|
||||||
extensions = (".com", ".exe", ".bat")
|
|
||||||
else:
|
|
||||||
extensions = ("",)
|
|
||||||
execpath = os.environ.get("PATH", "").split(os.path.pathsep)
|
|
||||||
for path in execpath:
|
|
||||||
for ext in extensions:
|
|
||||||
fn = os.path.join(path, name + ext)
|
|
||||||
if os.path.isfile(fn):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class BaseTransform(object):
|
class BaseTransform(object):
|
||||||
|
|
||||||
|
@ -54,11 +39,9 @@ class BaseTransform(object):
|
||||||
self.context = context
|
self.context = context
|
||||||
self.text = None
|
self.text = None
|
||||||
|
|
||||||
def __call__(self, f):
|
def __call__(self, fr):
|
||||||
if self.text is None:
|
if self.text is None:
|
||||||
fr = open(f, 'r')
|
|
||||||
self.text = fr.read()
|
self.text = fr.read()
|
||||||
fr.close()
|
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,22 +49,45 @@ class BaseFileTransform(BaseTransform):
|
||||||
|
|
||||||
implements(IFileTransform)
|
implements(IFileTransform)
|
||||||
|
|
||||||
|
extension = '.txt'
|
||||||
|
|
||||||
def __call__(self, fr):
|
def __call__(self, fr):
|
||||||
if self.text is None:
|
if self.text is None:
|
||||||
#fr = f.open("rb")
|
|
||||||
dirname = tempfile.mkdtemp()
|
dirname = tempfile.mkdtemp()
|
||||||
filename = os.path.join(dirname, "temp" + self.extension)
|
filename = os.path.join(dirname, "temp" + self.extension)
|
||||||
try:
|
try:
|
||||||
fw = open(filename, "wb")
|
fw = open(filename, "wb")
|
||||||
shutil.copyfileobj(fr, fw)
|
shutil.copyfileobj(fr, fw)
|
||||||
#fr.close()
|
|
||||||
fw.close()
|
fw.close()
|
||||||
text = self.extract(dirname, filename)
|
text = self.extract(dirname, filename)
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(dirname)
|
shutil.rmtree(dirname)
|
||||||
|
#fr.close()
|
||||||
self.text = text
|
self.text = text
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
def extract(self, dirname, filename):
|
def extract(self, dirname, filename):
|
||||||
raise ValueError('Method extract() has to be implemented by subclass.')
|
raise ValueError('Method extract() has to be implemented by subclass.')
|
||||||
|
|
||||||
|
def execute(self, com):
|
||||||
|
try:
|
||||||
|
import win32pipe
|
||||||
|
result = win32pipe.popen(com).read()
|
||||||
|
except ImportError:
|
||||||
|
result = os.popen(com).read()
|
||||||
|
return result
|
||||||
|
|
||||||
|
def checkAvailable(self, name, logMessage=''):
|
||||||
|
if sys.platform.lower().startswith("win"):
|
||||||
|
extensions = (".com", ".exe", ".bat")
|
||||||
|
else:
|
||||||
|
extensions = ("",)
|
||||||
|
execpath = os.environ.get("PATH", "").split(os.path.pathsep)
|
||||||
|
for path in execpath:
|
||||||
|
for ext in extensions:
|
||||||
|
fn = os.path.join(path, name + ext)
|
||||||
|
if os.path.isfile(fn):
|
||||||
|
return True
|
||||||
|
if logMessage:
|
||||||
|
logging.getLogger('zope.server').warn(logMessage)
|
||||||
|
return False
|
||||||
|
|
355
text/config/wvText.xml
Executable file
355
text/config/wvText.xml
Executable file
|
@ -0,0 +1,355 @@
|
||||||
|
<main>
|
||||||
|
<charentity>
|
||||||
|
<begin>ABW</begin>
|
||||||
|
</charentity>
|
||||||
|
|
||||||
|
<document>
|
||||||
|
<begin>
|
||||||
|
</begin>
|
||||||
|
<end>
|
||||||
|
</end>
|
||||||
|
</document>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<begin>
|
||||||
|
</begin>
|
||||||
|
<end>
|
||||||
|
</end>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<justification>
|
||||||
|
<left></left>
|
||||||
|
<right></right>
|
||||||
|
<center></center>
|
||||||
|
<block></block>
|
||||||
|
<asian></asian>
|
||||||
|
</justification>
|
||||||
|
|
||||||
|
<numbering>
|
||||||
|
<Arabic>type="1"</Arabic>
|
||||||
|
<UpperRoman>type="I"</UpperRoman>
|
||||||
|
<LowerRoman>type="i"</LowerRoman>
|
||||||
|
<UpperCaseN>type="A"</UpperCaseN>
|
||||||
|
<LowerCaseN>type="a"</LowerCaseN>
|
||||||
|
</numbering>
|
||||||
|
|
||||||
|
<border>
|
||||||
|
<noned></noned>
|
||||||
|
<singled></singled>
|
||||||
|
<thickd></thickd>
|
||||||
|
<doubled></doubled>
|
||||||
|
<number4d></number4d>
|
||||||
|
<hairlined></hairlined>
|
||||||
|
<dotd></dotd>
|
||||||
|
<dashlargegapd></dashlargegapd>
|
||||||
|
<dotdashd></dotdashd>
|
||||||
|
<dotdotdashd></dotdotdashd>
|
||||||
|
<tripled></tripled>
|
||||||
|
<thin-thicksmallgapd></thin-thicksmallgapd>
|
||||||
|
<thick-thinsmallgapd></thick-thinsmallgapd>
|
||||||
|
<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
|
||||||
|
<thin-thickmediumgapd></thin-thickmediumgapd>
|
||||||
|
<thick-thinmediumgapd></thick-thinmediumgapd>
|
||||||
|
<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
|
||||||
|
<thin-thicklargegapd></thin-thicklargegapd>
|
||||||
|
<thick-thinlargegapd></thick-thinlargegapd>
|
||||||
|
<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
|
||||||
|
<waved></waved>
|
||||||
|
<doublewaved></doublewaved>
|
||||||
|
<dashsmallgapd></dashsmallgapd>
|
||||||
|
<dashdotstrokedd></dashdotstrokedd>
|
||||||
|
<emboss3Dd></emboss3Dd>
|
||||||
|
<engrave3Dd></engrave3Dd>
|
||||||
|
<defaultd></defaultd>
|
||||||
|
</border>
|
||||||
|
|
||||||
|
<olist>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</olist>
|
||||||
|
|
||||||
|
<ulist>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</ulist>
|
||||||
|
|
||||||
|
<entry>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</entry>
|
||||||
|
|
||||||
|
<!-- the only thing of significance -->
|
||||||
|
<text>
|
||||||
|
<begin></begin>
|
||||||
|
<end>
|
||||||
|
</end>
|
||||||
|
</text>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
this tableoverride option can be used to turn off handling of
|
||||||
|
these tags in tables, which I find is necessary for at least netscape
|
||||||
|
-->
|
||||||
|
<tableoverrides>
|
||||||
|
<ParaBefore>0</ParaBefore>
|
||||||
|
<ParaRight>0</ParaRight>
|
||||||
|
<ParaAfter>0</ParaAfter>
|
||||||
|
<ParaLeft>0</ParaLeft>
|
||||||
|
<ParaLeft1>0</ParaLeft1>
|
||||||
|
<VertMergedCells>0</VertMergedCells>
|
||||||
|
</tableoverrides>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<row>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</row>
|
||||||
|
|
||||||
|
<cell>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</cell>
|
||||||
|
|
||||||
|
<paragraph>
|
||||||
|
<begin><text.begin/></begin>
|
||||||
|
<end><text.end/></end>
|
||||||
|
</paragraph>
|
||||||
|
|
||||||
|
<!-- these are all the character properties that can show up in word -->
|
||||||
|
<bold><begin></begin><end></end></bold>
|
||||||
|
<italic><begin></begin><end></end></italic>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
text that has been deleted and will be displayed with strikethrough when
|
||||||
|
revision marked text is to be displayed
|
||||||
|
|
||||||
|
use either this line...
|
||||||
|
-->
|
||||||
|
<RMarkDel><begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</RMarkDel>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
or uncomment below to make deleted text disappear (well, become commented out)
|
||||||
|
-->
|
||||||
|
<!--
|
||||||
|
<RMarkDel><begin><!--</begin><end>--></end></RMarkDel>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!-- I don't even know what outline means -->
|
||||||
|
<outline><begin></begin><end></end></outline>
|
||||||
|
<smallcaps><begin></begin><end></end></smallcaps>
|
||||||
|
<caps><begin></begin><end></end></caps>
|
||||||
|
<vanish><begin></begin><end></end></vanish>
|
||||||
|
|
||||||
|
<!--If you uncomment this then the annotation text links will become commented out by html tags-->
|
||||||
|
<!--
|
||||||
|
<vanish><begin><!--</begin><end>--></end></vanish>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--
|
||||||
|
text that has been newly typed since the last time revision marks have been accepted
|
||||||
|
and will be displayed with underline when revision marked text is to be displayed
|
||||||
|
|
||||||
|
use either this line...
|
||||||
|
-->
|
||||||
|
<RMark><begin></begin><end></end></RMark>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
or uncomment below to make the underline disappear
|
||||||
|
-->
|
||||||
|
<!--
|
||||||
|
<RMark><begin></begin><end></end></RMark>
|
||||||
|
-->
|
||||||
|
|
||||||
|
|
||||||
|
<strike><begin></begin><end></end></strike>
|
||||||
|
<shadow><begin></begin><end></end></shadow>
|
||||||
|
<lowercase><begin></begin><end></end></lowercase>
|
||||||
|
<emboss><begin></begin><end></end></emboss>
|
||||||
|
<imprint><begin></begin><end></end></imprint>
|
||||||
|
<!--double strike-->
|
||||||
|
<dstrike><begin></begin><end></end></dstrike>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
ftc's
|
||||||
|
&
|
||||||
|
hps
|
||||||
|
|
||||||
|
keep them for font face and do that later.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<super><begin></begin><end></end></super>
|
||||||
|
<sub><begin></begin><end></end></sub>
|
||||||
|
|
||||||
|
<singleu><begin></begin><end></end></singleu>
|
||||||
|
<wordu><begin></begin><end></end></wordu>
|
||||||
|
<doubleu><begin></begin><end></end></doubleu>
|
||||||
|
<dottedu><begin></begin><end></end></dottedu>
|
||||||
|
<hiddenu><begin></begin><end></end></hiddenu>
|
||||||
|
<thicku><begin></begin><end></end></thicku>
|
||||||
|
<dashu><begin></begin><end></end></dashu>
|
||||||
|
<dotu><begin></begin><end></end></dotu>
|
||||||
|
<dotdashu><begin></begin><end></end></dotdashu>
|
||||||
|
<dotdotdashu><begin></begin><end></end></dotdotdashu>
|
||||||
|
<waveu><begin></begin><end></end></waveu>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
text whose properties have been changed since the last time revision marks have been accepted
|
||||||
|
and will be displayed with a note showing the change points.
|
||||||
|
|
||||||
|
use either this line (which is, admittedly, a bit scary looking, but harmless)...
|
||||||
|
-->
|
||||||
|
<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
or uncomment below to make the notes disappear
|
||||||
|
-->
|
||||||
|
<!--
|
||||||
|
<PropRMark><begin></begin><end></end></PropRMark>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--
|
||||||
|
<color>
|
||||||
|
-->
|
||||||
|
<Black><begin></begin><end></end></Black>
|
||||||
|
<Blue><begin></begin><end></end></Blue>
|
||||||
|
<Cyan><begin></begin><end></end></Cyan>
|
||||||
|
<Green><begin></begin><end></end></Green>
|
||||||
|
<Magenta><begin></begin><end></end></Magenta>
|
||||||
|
<Red><begin></begin><end></end></Red>
|
||||||
|
<Yellow><begin></begin><end></end></Yellow>
|
||||||
|
<White><begin></begin><end></end></White>
|
||||||
|
<DkBlue><begin></begin><end></end></DkBlue>
|
||||||
|
<DkCyan><begin></begin><end></end></DkCyan>
|
||||||
|
<DkGreen><begin></begin><end></end></DkGreen>
|
||||||
|
<DkMagenta><begin></begin><end></end></DkMagenta>
|
||||||
|
<DkRed><begin></begin><end></end></DkRed>
|
||||||
|
<DkYellow><begin></begin><end></end></DkYellow>
|
||||||
|
<DkGray><begin></begin><end></end></DkGray>
|
||||||
|
<LtGray><begin></begin><end></end></LtGray>
|
||||||
|
<!--
|
||||||
|
</color>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--
|
||||||
|
<animation>
|
||||||
|
-->
|
||||||
|
<LasVegas><begin></begin><end></end></LasVegas>
|
||||||
|
<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
|
||||||
|
<SparkleText><begin></begin><end></end></SparkleText>
|
||||||
|
<MarchingAnts><begin></begin><end></end></MarchingAnts>
|
||||||
|
<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
|
||||||
|
<Shimmer><begin></begin><end></end></Shimmer>
|
||||||
|
<!--
|
||||||
|
</animation>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--
|
||||||
|
I don't understand what this one is, and I've never come across it
|
||||||
|
|
||||||
|
use this sample line (which is, admittedly, a bit scary looking, but harmless)...
|
||||||
|
-->
|
||||||
|
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
or uncomment below to ignore it, the previous might even crash wv ?
|
||||||
|
-->
|
||||||
|
<!--
|
||||||
|
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<animation>
|
||||||
|
<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
|
||||||
|
<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
|
||||||
|
</animation>
|
||||||
|
|
||||||
|
<fontstr>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</fontstr>
|
||||||
|
|
||||||
|
<comment>
|
||||||
|
<begin>
|
||||||
|
</begin>
|
||||||
|
<end>
|
||||||
|
</end>
|
||||||
|
</comment>
|
||||||
|
|
||||||
|
<style name="Normal">
|
||||||
|
<character>
|
||||||
|
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
||||||
|
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
||||||
|
</character>
|
||||||
|
|
||||||
|
<!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
|
||||||
|
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
||||||
|
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
||||||
|
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
||||||
|
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
||||||
|
-->
|
||||||
|
|
||||||
|
|
||||||
|
<pmargin>
|
||||||
|
<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
|
||||||
|
</pmargin>
|
||||||
|
|
||||||
|
<pborder>
|
||||||
|
<begin>
|
||||||
|
<!--
|
||||||
|
border: thin <borderleftstyle/> <borderleftcolor/>;
|
||||||
|
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
||||||
|
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
||||||
|
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
||||||
|
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
||||||
|
-->
|
||||||
|
</begin>
|
||||||
|
</pborder>
|
||||||
|
|
||||||
|
<picture>
|
||||||
|
<begin>
|
||||||
|
</begin>
|
||||||
|
<!-- images are lacking for now -->
|
||||||
|
|
||||||
|
</picture>
|
||||||
|
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<!--we need to be able to override the character properties-->
|
||||||
|
<!--
|
||||||
|
<style name="Normal">
|
||||||
|
<character>
|
||||||
|
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
||||||
|
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
||||||
|
</character>
|
||||||
|
|
||||||
|
<text>
|
||||||
|
<begin></begin>
|
||||||
|
<end>
|
||||||
|
</end>
|
||||||
|
</text>
|
||||||
|
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<style name="Heading 1">
|
||||||
|
|
||||||
|
<character>
|
||||||
|
<begin></begin>
|
||||||
|
<end></end>
|
||||||
|
</character>
|
||||||
|
|
||||||
|
<text>
|
||||||
|
<begin></begin>
|
||||||
|
<end>
|
||||||
|
</end>
|
||||||
|
</text>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</style>
|
||||||
|
-->
|
||||||
|
|
||||||
|
</main>
|
54
text/doc.py
Normal file
54
text/doc.py
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Searchable text support for Microsoft Word (.doc) files.
|
||||||
|
|
||||||
|
This uses the wvWare command to perform the extraction.
|
||||||
|
interface definitions for text transformations.
|
||||||
|
|
||||||
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys
|
||||||
|
|
||||||
|
from cybertools.text import base
|
||||||
|
|
||||||
|
try:
|
||||||
|
from Globals import package_home
|
||||||
|
wvConf = os.path.join(package_home(globals()), 'config', 'wvText.xml')
|
||||||
|
except ImportError:
|
||||||
|
wvConf = os.path.join(os.path.dirname(__file__), 'config', 'wvText.xml')
|
||||||
|
|
||||||
|
|
||||||
|
class DocTransform(base.BaseFileTransform):
|
||||||
|
|
||||||
|
extension = ".doc"
|
||||||
|
|
||||||
|
def extract(self, directory, filename):
|
||||||
|
if not self.checkAvailable('wvWare', 'wvWare is not available'):
|
||||||
|
return u''
|
||||||
|
if sys.platform == 'win32':
|
||||||
|
data = self.execute('wvWare -c utf-8 --nographics -x "%s" "%s" 2> nul:'
|
||||||
|
% (wvConf, filename))
|
||||||
|
else:
|
||||||
|
data = self.execute('wvWare -c utf-8 --nographics -x "%s" "%s" 2> /dev/null'
|
||||||
|
% (wvConf, filename))
|
||||||
|
return data.decode('UTF-8')
|
|
@ -27,17 +27,32 @@ from zope.interface import Interface
|
||||||
|
|
||||||
class ITextTransform(Interface):
|
class ITextTransform(Interface):
|
||||||
|
|
||||||
def __call__(f):
|
def __call__(fr):
|
||||||
""" Transform the content of file f to plain text and return
|
""" Transform the content of file fr (readfile) to plain text and
|
||||||
the result as unicode.
|
return the result as unicode.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class IFileTransform(ITextTransform):
|
class IFileTransform(ITextTransform):
|
||||||
""" A transformation that uses an intermediate disk file.
|
""" A transformation that is performed by calling some external program
|
||||||
|
and that typically uses an intermediate disk file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def extract(dirname, filename):
|
def extract(dirname, filename):
|
||||||
""" Extract text contents from the file specified by dirnam, filename,
|
""" Extract text contents from the file specified by ``filename``,
|
||||||
using some external programm, and return the result as unicode.
|
using some external program, and return the result as unicode.
|
||||||
|
``dirname`` is the path to a temporary directory that
|
||||||
|
usually (but not necessarily) contains the file and may
|
||||||
|
be used for creating other (temporary) files if needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def execute(command):
|
||||||
|
""" Execute a system command and return the output of the program
|
||||||
|
called.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def checkAvailable(progname, logMessage=''):
|
||||||
|
""" Check the availability of the program named ``progname``.
|
||||||
|
Return True if available; if ``logMessage`` is given, put this
|
||||||
|
as a warning message into the log if the program is not available.
|
||||||
"""
|
"""
|
||||||
|
|
41
text/pdf.py
41
text/pdf.py
|
@ -1,9 +1,31 @@
|
||||||
"""Searchable text support for Portable Document Format (PDF) files.
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
This uses the pdftotext command from xpdf to perform the extraction.
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
"""
|
"""
|
||||||
__docformat__ = "reStructuredText"
|
Searchable text support for Portable Document Format (PDF) files.
|
||||||
|
|
||||||
|
This uses the pdftotext command from xpdf to perform the extraction.
|
||||||
|
interface definitions for text transformations.
|
||||||
|
|
||||||
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
|
@ -15,12 +37,7 @@ class PdfTransform(base.BaseFileTransform):
|
||||||
extension = ".pdf"
|
extension = ".pdf"
|
||||||
|
|
||||||
def extract(self, directory, filename):
|
def extract(self, directory, filename):
|
||||||
if not base.haveProgram("pdftotext"):
|
if not self.checkAvailable('pdftotext', 'pdftotext is not available'):
|
||||||
print 'Warning: pdftotext is not available'
|
|
||||||
return u''
|
return u''
|
||||||
txtfile = os.path.join(directory, "words.txt")
|
data = self.execute('pdftotext -enc UTF-8 "%s" -' % filename)
|
||||||
st = os.system("pdftotext -enc UTF-8 %s %s" % (filename, txtfile))
|
return data.decode('UTF-8')
|
||||||
f = open(txtfile, "rb")
|
|
||||||
data = f.read()
|
|
||||||
f.close()
|
|
||||||
return unicode(data, "utf-8")
|
|
||||||
|
|
BIN
text/testfiles/mary.doc
Normal file
BIN
text/testfiles/mary.doc
Normal file
Binary file not shown.
BIN
text/testfiles/mary.odt
Normal file
BIN
text/testfiles/mary.odt
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue