added text conversion for RTF
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1629 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
06d4652807
commit
7b236ecbec
3 changed files with 112 additions and 0 deletions
|
@ -67,6 +67,20 @@ Word Documents
|
||||||
>>> u'lamb' in words
|
>>> u'lamb' in words
|
||||||
True
|
True
|
||||||
|
|
||||||
|
RTF Files
|
||||||
|
---------
|
||||||
|
|
||||||
|
>>> from cybertools.text.rtf import RtfTransform
|
||||||
|
>>> transform = RtfTransform(None)
|
||||||
|
>>> f = open(os.path.join(testdir, 'mary.rtf'))
|
||||||
|
>>> result = transform(f)
|
||||||
|
>>> print log
|
||||||
|
>>> words = result.split()
|
||||||
|
>>> len(words)
|
||||||
|
90
|
||||||
|
>>> u'lamb' in words
|
||||||
|
True
|
||||||
|
|
||||||
PowerPoint Presentations
|
PowerPoint Presentations
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
|
|
63
text/rtf.py
Normal file
63
text/rtf.py
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Searchable text support for Rich Text Format (.rtf) files.
|
||||||
|
|
||||||
|
This uses the rtf2xml command to perform the extraction.
|
||||||
|
|
||||||
|
Based on code provided by zc.index and TextIndexNG3.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys
|
||||||
|
from xml import sax
|
||||||
|
from cStringIO import StringIO
|
||||||
|
|
||||||
|
from cybertools.text import base
|
||||||
|
|
||||||
|
|
||||||
|
class RtfTextHandler(sax.ContentHandler):
    """SAX handler collecting the character data of a document.

    Text is stored UTF-8 encoded in an in-memory buffer; each ``para``
    start tag contributes a newline so that paragraphs stay separated.
    """

    def startDocument(self):
        """Create a fresh, empty buffer for the document being parsed."""
        self._data = StringIO()

    def startElement(self, name, attrs):
        """Emit a line break whenever a ``para`` element opens."""
        if name != 'para':
            return
        self._data.write('\n')

    def characters(self, text):
        """Append ``text`` to the buffer, encoded as UTF-8."""
        encoded = text.encode('UTF-8')
        self._data.write(encoded)

    def getData(self):
        """Return the buffer contents collected since ``startDocument``.

        The buffer only exists once ``startDocument`` has been called.
        """
        collected = self._data
        return collected.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
class RtfTransform(base.BaseFileTransform):
    """File transform producing plain text from ``.rtf`` files.

    The RTF input is converted to XML by the external ``rtf2xml``
    command; the XML is then reduced to text by ``RtfTextHandler``.
    """

    extension = ".rtf"

    def extract(self, directory, filename):
        """Return the text of the RTF file ``filename`` as unicode.

        ``directory`` is accepted as part of the signature but is not
        used here. An empty unicode string is returned when the
        availability check for ``rtf2xml`` fails.
        """
        available = self.checkAvailable('rtf2xml', 'rtf2xml is not available')
        if not available:
            return u''
        command = 'rtf2xml --no-dtd "%s"' % filename
        # presumably execute() runs the command and returns its output
        # (the XML document) -- verify against base.BaseFileTransform
        xml_output = self.execute(command)
        text_handler = RtfTextHandler()
        sax.parseString(xml_output, text_handler)
        # the handler stores UTF-8 encoded bytes; hand back unicode
        utf8_text = text_handler.getData()
        return utf8_text.decode('UTF-8')
|
35
text/testfiles/mary.rtf
Normal file
35
text/testfiles/mary.rtf
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
{\rtf1\ansi\deff1\adeflang1025
|
||||||
|
{\fonttbl{\f0\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f1\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f2\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f3\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f5\fnil\fprq2\fcharset0 DejaVu Sans;}{\f6\fswiss\fprq2\fcharset0 DejaVu Sans;}}
|
||||||
|
{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
|
||||||
|
{\stylesheet{\s1\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\snext1 Normal;}
|
||||||
|
{\s2\sb240\sa120\keepn\aspalpha\rtlch\af5\afs28\lang255\ltrch\dbch\af5\langfe255\hich\f2\fs28\lang1031\loch\f2\fs28\lang1031\sbasedon1\snext3 Heading;}
|
||||||
|
{\s3\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext3 Body Text;}
|
||||||
|
{\s4\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon3\snext4 List;}
|
||||||
|
{\s5\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext5 caption;}
|
||||||
|
{\s6\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext6 Index;}
|
||||||
|
{\s7\sb240\sa120\keepn\aspalpha\rtlch\af1\afs28\lang255\ltrch\dbch\af1\langfe255\hich\f4\fs28\lang1031\loch\f4\fs28\lang1031\sbasedon1\snext3 Heading;}
|
||||||
|
{\s8\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext8 caption;}
|
||||||
|
{\s9\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext9 Index;}
|
||||||
|
}
|
||||||
|
{\info{\creatim\yr2007\mo3\dy8\hr12\min8}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709
|
||||||
|
{\*\pgdsctbl
|
||||||
|
{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
|
||||||
|
{\*\pgdscno0}\paperh16837\paperw11905\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
||||||
|
\pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Little lamb, little l\'e4mb,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Its fleece as white as snow.}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary went, Mary went,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 The lamb was sure to go.}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 School one day, school one day,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Which was against the rules.}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Laugh and play, laugh and play,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 To see a lamb in school.}
|
||||||
|
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031
|
||||||
|
\par }
|
Loading…
Add table
Reference in a new issue