diff --git a/text/README.txt b/text/README.txt index 42b2ad2..8ead0a8 100644 --- a/text/README.txt +++ b/text/README.txt @@ -67,6 +67,20 @@ Word Documents >>> u'lamb' in words True +RTF Files +--------- + + >>> from cybertools.text.rtf import RtfTransform + >>> transform = RtfTransform(None) + >>> f = open(os.path.join(testdir, 'mary.rtf')) + >>> result = transform(f) + >>> print log + >>> words = result.split() + >>> len(words) + 90 + >>> u'lamb' in words + True + PowerPoint Presentations ------------------------ diff --git a/text/rtf.py b/text/rtf.py new file mode 100644 index 0000000..7972cc4 --- /dev/null +++ b/text/rtf.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Searchable text support for RTF (.rtf) files. + +This uses the rtf2xml command to perform the extraction. + +Based on code provided by zc.index and TextIndexNG3. 
class RtfTextHandler(sax.ContentHandler):
    """SAX content handler that accumulates the character data of a document.

    Character data is written UTF-8 encoded into an in-memory buffer, and a
    newline is written whenever a ``para`` element starts.
    """

    def startDocument(self):
        # The buffer is (re)created at the start of every parse run; it does
        # not exist before this callback has fired.
        self._data = StringIO()

    def startElement(self, name, attrs):
        # Only ``para`` elements contribute a separator to the output.
        if name != 'para':
            return
        self._data.write('\n')

    def characters(self, text):
        # Encode before writing so the buffer only ever holds byte strings.
        encoded = text.encode('UTF-8')
        self._data.write(encoded)

    def getData(self):
        """Return the UTF-8 encoded contents of the buffer."""
        buf = self._data
        return buf.getvalue()


class RtfTransform(base.BaseFileTransform):
    """File transform for ``.rtf`` files built on the ``rtf2xml`` command."""

    extension = ".rtf"

    def extract(self, directory, filename):
        """Return the text found in `filename` as a unicode string.

        The file is run through ``rtf2xml --no-dtd`` and the resulting XML
        is parsed with `RtfTextHandler`. An empty unicode string is returned
        when ``self.checkAvailable`` gives a false result for ``rtf2xml``.
        The `directory` argument is not used here.
        """
        available = self.checkAvailable('rtf2xml', 'rtf2xml is not available')
        if not available:
            return u''
        # An earlier variant changed into /tmp before invoking rtf2xml; it
        # was left disabled in the original code.
        command = 'rtf2xml --no-dtd "%s"' % filename
        xml_output = self.execute(command)
        collector = RtfTextHandler()
        sax.parseString(xml_output, collector)
        collected = collector.getData()
        return collected.decode('UTF-8')
+{\s4\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon3\snext4 List;} +{\s5\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext5 caption;} +{\s6\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext6 Index;} +{\s7\sb240\sa120\keepn\aspalpha\rtlch\af1\afs28\lang255\ltrch\dbch\af1\langfe255\hich\f4\fs28\lang1031\loch\f4\fs28\lang1031\sbasedon1\snext3 Heading;} +{\s8\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext8 caption;} +{\s9\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext9 Index;} +} +{\info{\creatim\yr2007\mo3\dy8\hr12\min8}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709 +{\*\pgdsctbl +{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}} +{\*\pgdscno0}\paperh16837\paperw11905\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc +\pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Little lamb, little l\'e4mb,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,} +\par \pard\plain 
\ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Its fleece as white as snow.} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary went, Mary went,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 The lamb was sure to go.} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 School one day, school one day,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Which was against the rules.} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,} 
+\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Laugh and play, laugh and play,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 To see a lamb in school.} +\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 +\par } \ No newline at end of file