added text conversion for RTF
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1629 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
06d4652807
commit
7b236ecbec
3 changed files with 112 additions and 0 deletions
|
@ -67,6 +67,20 @@ Word Documents
|
|||
>>> u'lamb' in words
|
||||
True
|
||||
|
||||
RTF Files
|
||||
---------
|
||||
|
||||
>>> from cybertools.text.rtf import RtfTransform
|
||||
>>> transform = RtfTransform(None)
|
||||
>>> f = open(os.path.join(testdir, 'mary.rtf'))
|
||||
>>> result = transform(f)
|
||||
>>> print log
|
||||
>>> words = result.split()
|
||||
>>> len(words)
|
||||
90
|
||||
>>> u'lamb' in words
|
||||
True
|
||||
|
||||
PowerPoint Presentations
|
||||
------------------------
|
||||
|
||||
|
|
63
text/rtf.py
Normal file
63
text/rtf.py
Normal file
|
@ -0,0 +1,63 @@
|
|||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Searchable text support for Rich Text Format (.rtf) files.
|
||||
|
||||
This uses the rtf2xml command to perform the extraction.
|
||||
|
||||
Based on code provided by zc.index and TextIndexNG3.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
import os, sys
|
||||
from xml import sax
|
||||
from cStringIO import StringIO
|
||||
|
||||
from cybertools.text import base
|
||||
|
||||
|
||||
class RtfTextHandler(sax.ContentHandler):
    """SAX content handler that collects the character data of a document.

    Text nodes are gathered in document order; a newline is emitted at the
    start of every ``para`` element so that paragraphs stay separated.
    """

    def __init__(self):
        # ContentHandler may be an old-style class (Python 2), so call the
        # base initializer explicitly instead of using super().
        sax.ContentHandler.__init__(self)
        # Create the buffer right away so that getData() is safe to call
        # even when no document has been parsed yet.
        self._chunks = []

    def startDocument(self):
        """Discard previously collected text when a new document starts."""
        self._chunks = []

    def startElement(self, name, attrs):
        """Insert a line break in front of every ``para`` element."""
        if name == 'para':
            self._chunks.append(u'\n')

    def characters(self, text):
        """Remember a chunk of character data reported by the parser."""
        self._chunks.append(text)

    def getData(self):
        """Return the text collected so far as a UTF-8 encoded byte string."""
        # Encode once at the boundary instead of once per chunk.
        return u''.join(self._chunks).encode('UTF-8')
|
||||
|
||||
|
||||
class RtfTransform(base.BaseFileTransform):
    """File transform that extracts the plain text of an RTF file.

    The file is converted to XML by the external ``rtf2xml`` command and the
    character data of that XML is returned as the searchable text.
    """

    extension = ".rtf"

    def extract(self, directory, filename):
        """Return the text content of ``filename`` as a unicode string.

        ``directory`` is not used by this implementation. An empty unicode
        string is returned when ``checkAvailable`` reports a false value for
        ``rtf2xml`` (presumably: the tool is not installed -- the check
        itself lives in the base class and is not visible here).
        """
        toolFound = self.checkAvailable('rtf2xml', 'rtf2xml is not available')
        if not toolFound:
            return u''
        collector = RtfTextHandler()
        command = 'rtf2xml --no-dtd "%s"' % filename
        # NOTE(review): execute() is assumed to return the command's XML
        # output as a string -- confirm against base.BaseFileTransform.
        sax.parseString(self.execute(command), collector)
        utf8Text = collector.getData()
        return utf8Text.decode('UTF-8')
|
35
text/testfiles/mary.rtf
Normal file
35
text/testfiles/mary.rtf
Normal file
|
@ -0,0 +1,35 @@
|
|||
{\rtf1\ansi\deff1\adeflang1025
|
||||
{\fonttbl{\f0\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f1\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f2\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f3\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f5\fnil\fprq2\fcharset0 DejaVu Sans;}{\f6\fswiss\fprq2\fcharset0 DejaVu Sans;}}
|
||||
{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
|
||||
{\stylesheet{\s1\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\snext1 Normal;}
|
||||
{\s2\sb240\sa120\keepn\aspalpha\rtlch\af5\afs28\lang255\ltrch\dbch\af5\langfe255\hich\f2\fs28\lang1031\loch\f2\fs28\lang1031\sbasedon1\snext3 Heading;}
|
||||
{\s3\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext3 Body Text;}
|
||||
{\s4\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon3\snext4 List;}
|
||||
{\s5\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext5 caption;}
|
||||
{\s6\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext6 Index;}
|
||||
{\s7\sb240\sa120\keepn\aspalpha\rtlch\af1\afs28\lang255\ltrch\dbch\af1\langfe255\hich\f4\fs28\lang1031\loch\f4\fs28\lang1031\sbasedon1\snext3 Heading;}
|
||||
{\s8\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext8 caption;}
|
||||
{\s9\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext9 Index;}
|
||||
}
|
||||
{\info{\creatim\yr2007\mo3\dy8\hr12\min8}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709
|
||||
{\*\pgdsctbl
|
||||
{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
|
||||
{\*\pgdscno0}\paperh16837\paperw11905\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
|
||||
\pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Little lamb, little l\'e4mb,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Its fleece as white as snow.}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary went, Mary went,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 The lamb was sure to go.}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 School one day, school one day,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Which was against the rules.}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Laugh and play, laugh and play,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 To see a lamb in school.}
|
||||
\par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031
|
||||
\par }
|
Loading…
Add table
Reference in a new issue