added text conversion for RTF
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1629 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
		
							parent
							
								
									06d4652807
								
							
						
					
					
						commit
						7b236ecbec
					
				
					 3 changed files with 112 additions and 0 deletions
				
			
		|  | @ -67,6 +67,20 @@ Word Documents | ||||||
|   >>> u'lamb' in words |   >>> u'lamb' in words | ||||||
|   True |   True | ||||||
| 
 | 
 | ||||||
|  | RTF Files | ||||||
|  | --------- | ||||||
|  | 
 | ||||||
|  |   >>> from cybertools.text.rtf import RtfTransform | ||||||
|  |   >>> transform = RtfTransform(None) | ||||||
|  |   >>> f = open(os.path.join(testdir, 'mary.rtf')) | ||||||
|  |   >>> result = transform(f) | ||||||
|  |   >>> print log | ||||||
|  |   >>> words = result.split() | ||||||
|  |   >>> len(words) | ||||||
|  |   90 | ||||||
|  |   >>> u'lamb' in words | ||||||
|  |   True | ||||||
|  | 
 | ||||||
| PowerPoint Presentations | PowerPoint Presentations | ||||||
| ------------------------ | ------------------------ | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										63
									
								
								text/rtf.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								text/rtf.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,63 @@ | ||||||
|  | # | ||||||
|  | #  Copyright (c) 2007 Helmut Merz helmutm@cy55.de | ||||||
|  | # | ||||||
|  | #  This program is free software; you can redistribute it and/or modify | ||||||
|  | #  it under the terms of the GNU General Public License as published by | ||||||
|  | #  the Free Software Foundation; either version 2 of the License, or | ||||||
|  | #  (at your option) any later version. | ||||||
|  | # | ||||||
|  | #  This program is distributed in the hope that it will be useful, | ||||||
|  | #  but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | #  GNU General Public License for more details. | ||||||
|  | # | ||||||
|  | #  You should have received a copy of the GNU General Public License | ||||||
|  | #  along with this program; if not, write to the Free Software | ||||||
|  | #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Searchable text support for Rich Text Format (.rtf) files. | ||||||
|  | 
 | ||||||
|  | This uses the rtf2xml command to perform the extraction. | ||||||
|  | 
 | ||||||
|  | Based on code provided by zc.index and TextIndexNG3. | ||||||
|  | 
 | ||||||
|  | $Id$ | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | import os, sys | ||||||
|  | from xml import sax | ||||||
|  | from cStringIO import StringIO | ||||||
|  | 
 | ||||||
|  | from cybertools.text import base | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class RtfTextHandler(sax.ContentHandler): | ||||||
|  | 
 | ||||||
|  |     def characters(self, text): | ||||||
|  |         self._data.write(text.encode('UTF-8')) | ||||||
|  | 
 | ||||||
|  |     def startDocument(self): | ||||||
|  |         self._data = StringIO() | ||||||
|  | 
 | ||||||
|  |     def startElement(self, name, attrs): | ||||||
|  |         if name == 'para': | ||||||
|  |             self._data.write('\n') | ||||||
|  | 
 | ||||||
|  |     def getData(self): | ||||||
|  |         return self._data.getvalue() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class RtfTransform(base.BaseFileTransform): | ||||||
|  | 
 | ||||||
|  |     extension = ".rtf" | ||||||
|  | 
 | ||||||
|  |     def extract(self, directory, filename): | ||||||
|  |         if not self.checkAvailable('rtf2xml', 'rtf2xml is not available'): | ||||||
|  |             return u'' | ||||||
|  |         #xmlstr = self.execute('cd /tmp && rtf2xml --no-dtd "%s"' % filename) | ||||||
|  |         xmlstr = self.execute('rtf2xml --no-dtd "%s"' % filename) | ||||||
|  |         handler = RtfTextHandler() | ||||||
|  |         sax.parseString(xmlstr, handler) | ||||||
|  |         return handler.getData().decode('UTF-8') | ||||||
							
								
								
									
										35
									
								
								text/testfiles/mary.rtf
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								text/testfiles/mary.rtf
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,35 @@ | ||||||
|  | {\rtf1\ansi\deff1\adeflang1025 | ||||||
|  | {\fonttbl{\f0\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f1\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f2\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f3\froman\fprq2\fcharset0 Nimbus Roman No9 L{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Nimbus Sans L{\*\falt Arial};}{\f5\fnil\fprq2\fcharset0 DejaVu Sans;}{\f6\fswiss\fprq2\fcharset0 DejaVu Sans;}} | ||||||
|  | {\colortbl;\red0\green0\blue0;\red128\green128\blue128;} | ||||||
|  | {\stylesheet{\s1\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\snext1 Normal;} | ||||||
|  | {\s2\sb240\sa120\keepn\aspalpha\rtlch\af5\afs28\lang255\ltrch\dbch\af5\langfe255\hich\f2\fs28\lang1031\loch\f2\fs28\lang1031\sbasedon1\snext3 Heading;} | ||||||
|  | {\s3\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext3 Body Text;} | ||||||
|  | {\s4\sa120\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon3\snext4 List;} | ||||||
|  | {\s5\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext5 caption;} | ||||||
|  | {\s6\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext6 Index;} | ||||||
|  | {\s7\sb240\sa120\keepn\aspalpha\rtlch\af1\afs28\lang255\ltrch\dbch\af1\langfe255\hich\f4\fs28\lang1031\loch\f4\fs28\lang1031\sbasedon1\snext3 Heading;} | ||||||
|  | {\s8\sb120\sa120\aspalpha\rtlch\af6\afs24\lang255\ai\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\i\loch\fs24\lang1031\i\sbasedon1\snext8 caption;} | ||||||
|  | {\s9\aspalpha\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031\sbasedon1\snext9 Index;} | ||||||
|  | } | ||||||
|  | {\info{\creatim\yr2007\mo3\dy8\hr12\min8}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6800}}\deftab709 | ||||||
|  | {\*\pgdsctbl | ||||||
|  | {\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}} | ||||||
|  | {\*\pgdscno0}\paperh16837\paperw11905\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc | ||||||
|  | \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Little lamb, little l\'e4mb,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary had a little lamb,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Its fleece as white as snow.} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Mary went, Mary went,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Everywhere that Mary went,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 The lamb was sure to go.} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 School one day, school one day,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It followed her to school one day,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Which was against the rules.} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 Laugh and play, laugh and play,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 It made the children laugh and play,} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031 {\rtlch \ltrch\loch\f1\fs24\lang1031\i0\b0 To see a lamb in school.} | ||||||
|  | \par \pard\plain \ltrpar\s1\aspalpha\ql\rtlch\af6\afs24\lang255\ltrch\dbch\af6\langfe255\hich\fs24\lang1031\loch\fs24\lang1031  | ||||||
|  | \par } | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 helmutm
						helmutm