added text conversion for RTF

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1629 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2007-03-08 11:49:02 +00:00
parent 06d4652807
commit 7b236ecbec
3 changed files with 112 additions and 0 deletions

View file

@ -67,6 +67,20 @@ Word Documents
>>> u'lamb' in words
True
RTF Files
---------
>>> from cybertools.text.rtf import RtfTransform
>>> transform = RtfTransform(None)
>>> f = open(os.path.join(testdir, 'mary.rtf'))
>>> result = transform(f)
>>> print log
>>> words = result.split()
>>> len(words)
90
>>> u'lamb' in words
True
PowerPoint Presentations
------------------------

63
text/rtf.py Normal file
View file

@ -0,0 +1,63 @@
#
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Searchable text support for Rich Text Format (.rtf) files.
This uses the rtf2xml command to perform the extraction.
Based on code provided by zc.index and TextIndexNG3.
$Id$
"""
import os, sys
from xml import sax
from cStringIO import StringIO
from cybertools.text import base
class RtfTextHandler(sax.ContentHandler):
    """SAX handler that collects the character data of an XML document.

    All text is accumulated UTF-8 encoded in an in-memory buffer. A
    newline is written at the start of every ``para`` element so that
    paragraphs stay separated in the collected text.
    """

    def __init__(self):
        sax.ContentHandler.__init__(self)
        # Create the buffer right away so that getData() also works on a
        # handler that has not (yet) been fed a document.
        self._data = StringIO()

    def _write(self, text):
        # Single place where text is encoded, so the buffer only ever
        # receives UTF-8 encoded data.
        self._data.write(text.encode('UTF-8'))

    def characters(self, text):
        """Append a chunk of character data to the buffer."""
        self._write(text)

    def startDocument(self):
        """Begin every parsed document with a fresh, empty buffer."""
        self._data = StringIO()

    def startElement(self, name, attrs):
        """Write a line break when a ``para`` element starts."""
        if name == 'para':
            self._write(u'\n')

    def getData(self):
        """Return everything collected so far, UTF-8 encoded."""
        return self._data.getvalue()
class RtfTransform(base.BaseFileTransform):
    """File transform that extracts the text of RTF files.

    The conversion is delegated to the external ``rtf2xml`` command; its
    XML output is reduced to plain text by ``RtfTextHandler``.
    """

    extension = ".rtf"

    def extract(self, directory, filename):
        """Return the text contained in the RTF file ``filename``.

        ``directory`` is accepted but not used by this implementation.
        An empty unicode string is returned when the ``rtf2xml``
        availability check fails.
        """
        available = self.checkAvailable('rtf2xml', 'rtf2xml is not available')
        if available:
            # NOTE(review): the file name is only wrapped in double quotes;
            # presumably execute() hands the string to a shell, in which
            # case names containing quotes or '$' would break - confirm.
            command = 'rtf2xml --no-dtd "%s"' % filename
            xmlOutput = self.execute(command)
            collector = RtfTextHandler()
            sax.parseString(xmlOutput, collector)
            return collector.getData().decode('UTF-8')
        return u''

35
text/testfiles/mary.rtf Normal file
View file

@ -0,0 +1,35 @@
{\rtf1\ansi\deff1\adeflang1025
{\fonttbl{\f0\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f1\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f2\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f3\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f5\fnil\fprq2\fcharset0 DejaVu Sans;}{\f6\fswiss\fprq2\fcharset0 DejaVu Sans;}}
{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
{\stylesheet{\s1\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\snext1 Normal;}
{\s2\sb240\sa120\keepn\aspalpha\rtlch\af5\afs28\lang255\ltrch\dbch\af5\langfe255\hich\f2\fs28\lang1031\loch\f2\fs28\lang1031\sbasedon1\snext3 Heading;}
{\s3\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext3 Body Text;}
{\s4\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon3\snext4 List;}
{\s5\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext5 caption;}
{\s6\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext6 Index;}
{\s7\sb240\sa120\keepn\aspalpha\rtlch\af1\afs28\lang255\ltrch\dbch\af1\langfe255\hich\f4\fs28\lang1031\loch\f4\fs28\lang1031\sbasedon1\snext3 Heading;}
{\s8\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext8 caption;}
{\s9\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext9 Index;}
}
{\info{\creatim\yr2007\mo3\dy8\hr12\min8}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709
{\*\pgdsctbl
{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
{\*\pgdscno0}\paperh16837\paperw11905\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
\pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Little lamb, little l\'e4mb,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Its fleece as white as snow.}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary went, Mary went,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 The lamb was sure to go.}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 School one day, school one day,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Which was against the rules.}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Laugh and play, laugh and play,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 To see a lamb in school.}
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031
\par }