provide text converters for XLS and PPT
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1626 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
		
							parent
							
								
									65249df13f
								
							
						
					
					
						commit
						06d4652807
					
				
					 9 changed files with 124 additions and 4 deletions
				
			
		|  | @ -66,3 +66,32 @@ Word Documents | ||||||
|   89 |   89 | ||||||
|   >>> u'lamb' in words |   >>> u'lamb' in words | ||||||
|   True |   True | ||||||
|  | 
 | ||||||
|  | PowerPoint Presentations | ||||||
|  | ------------------------ | ||||||
|  | 
 | ||||||
|  |   >>> from cybertools.text.ppt import PptTransform | ||||||
|  |   >>> transform = PptTransform(None) | ||||||
|  |   >>> f = open(os.path.join(testdir, 'mary.ppt')) | ||||||
|  |   >>> result = transform(f) | ||||||
|  |   >>> print log | ||||||
|  |   >>> words = result.split() | ||||||
|  |   >>> len(words) | ||||||
|  |   102 | ||||||
|  |   >>> u'lamb' in words | ||||||
|  |   True | ||||||
|  | 
 | ||||||
|  | Excel Spreadsheets | ||||||
|  | ------------------ | ||||||
|  | 
 | ||||||
|  |   >>> from cybertools.text.xls import XlsTransform | ||||||
|  |   >>> transform = XlsTransform(None) | ||||||
|  |   >>> f = open(os.path.join(testdir, 'mary.xls')) | ||||||
|  |   >>> result = transform(f) | ||||||
|  |   >>> print log | ||||||
|  |   >>> words = result.split() | ||||||
|  |   >>> len(words) | ||||||
|  |   89 | ||||||
|  |   >>> u'lamb' in words | ||||||
|  |   True | ||||||
|  | 
 | ||||||
|  |  | ||||||
|  | @ -17,10 +17,9 @@ | ||||||
| # | # | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| Searchable text support for Portable Document Format (PDF) files. | Searchable text support for MS Word (.doc) files. | ||||||
| 
 | 
 | ||||||
| This uses the pdftotext command from xpdf to perform the extraction. | This uses the wvware command to perform the extraction. | ||||||
| interface definitions for text transformations. |  | ||||||
| 
 | 
 | ||||||
| Based on code provided by zc.index and TextIndexNG3. | Based on code provided by zc.index and TextIndexNG3. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -20,7 +20,6 @@ | ||||||
| Searchable text support for Portable Document Format (PDF) files. | Searchable text support for Portable Document Format (PDF) files. | ||||||
| 
 | 
 | ||||||
| This uses the pdftotext command from xpdf to perform the extraction. | This uses the pdftotext command from xpdf to perform the extraction. | ||||||
| interface definitions for text transformations. |  | ||||||
| 
 | 
 | ||||||
| Based on code provided by zc.index and TextIndexNG3. | Based on code provided by zc.index and TextIndexNG3. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										48
									
								
								text/ppt.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								text/ppt.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,48 @@ | ||||||
|  | # | ||||||
|  | #  Copyright (c) 2007 Helmut Merz helmutm@cy55.de | ||||||
|  | # | ||||||
|  | #  This program is free software; you can redistribute it and/or modify | ||||||
|  | #  it under the terms of the GNU General Public License as published by | ||||||
|  | #  the Free Software Foundation; either version 2 of the License, or | ||||||
|  | #  (at your option) any later version. | ||||||
|  | # | ||||||
|  | #  This program is distributed in the hope that it will be useful, | ||||||
|  | #  but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | #  GNU General Public License for more details. | ||||||
|  | # | ||||||
|  | #  You should have received a copy of the GNU General Public License | ||||||
|  | #  along with this program; if not, write to the Free Software | ||||||
|  | #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Searchable text support for MS PowerPoint (.ppt) files. | ||||||
|  | 
 | ||||||
|  | This uses the ppthtml command to perform the extraction; the HTML it | ||||||
|  | produces is then converted to plain text. | ||||||
|  | 
 | ||||||
|  | Based on code provided by zc.index and TextIndexNG3. | ||||||
|  | 
 | ||||||
|  | $Id$ | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | import os, sys | ||||||
|  | 
 | ||||||
|  | from cybertools.text import base | ||||||
|  | from cybertools.text.html import htmlToText | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class PptTransform(base.BaseFileTransform):
    """Text transform for MS PowerPoint (.ppt) files, driven by ``ppthtml``."""

    extension = ".ppt"

    def extract(self, directory, filename):
        """Return the text of the presentation stored in ``filename``.

        ``ppthtml`` is run on the file, its HTML output is passed through
        ``htmlToText`` and the result is decoded from ISO8859-15.  An empty
        unicode string is returned when ``checkAvailable`` reports that
        ``ppthtml`` cannot be used.  ``directory`` is not used here.
        """
        if not self.checkAvailable('ppthtml', 'ppthtml is not available'):
            return u''
        # Only the sink for the tool's stderr differs between platforms.
        if sys.platform == 'win32':
            command = 'ppthtml "%s" 2> nul:'
        else:
            command = 'ppthtml "%s" 2> /dev/null'
        html = self.execute(command % filename)
        return htmlToText(html).decode('ISO8859-15')
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.odp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.odp
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.ods
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.ods
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.ppt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.ppt
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.xls
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.xls
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										45
									
								
								text/xls.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								text/xls.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,45 @@ | ||||||
|  | # | ||||||
|  | #  Copyright (c) 2007 Helmut Merz helmutm@cy55.de | ||||||
|  | # | ||||||
|  | #  This program is free software; you can redistribute it and/or modify | ||||||
|  | #  it under the terms of the GNU General Public License as published by | ||||||
|  | #  the Free Software Foundation; either version 2 of the License, or | ||||||
|  | #  (at your option) any later version. | ||||||
|  | # | ||||||
|  | #  This program is distributed in the hope that it will be useful, | ||||||
|  | #  but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | #  GNU General Public License for more details. | ||||||
|  | # | ||||||
|  | #  You should have received a copy of the GNU General Public License | ||||||
|  | #  along with this program; if not, write to the Free Software | ||||||
|  | #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Searchable text support for MS Excel (.xls) files. | ||||||
|  | 
 | ||||||
|  | This uses the xls2csv command to perform the extraction. | ||||||
|  | 
 | ||||||
|  | Based on code provided by zc.index and TextIndexNG3. | ||||||
|  | 
 | ||||||
|  | $Id$ | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | import os, sys | ||||||
|  | 
 | ||||||
|  | from cybertools.text import base | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class XlsTransform(base.BaseFileTransform):
    """Text transform for MS Excel (.xls) files, driven by ``xls2csv``."""

    extension = ".xls"

    def extract(self, directory, filename):
        """Return the text of the spreadsheet stored in ``filename``.

        ``xls2csv`` is run on the file and its output is decoded from
        ISO8859-1.  An empty unicode string is returned when
        ``checkAvailable`` reports that ``xls2csv`` cannot be used.
        ``directory`` is not used here.
        """
        if not self.checkAvailable('xls2csv', 'xls2csv is not available'):
            return u''
        # NOTE(review): "-d 8859-1" presumably selects the output charset
        # (matching the decode below) and "-q 0" the quoting mode -- confirm
        # against the xls2csv documentation.
        # Only the sink for the tool's stderr differs between platforms.
        if sys.platform == 'win32':
            command = 'xls2csv -d 8859-1 -q 0 "%s" 2> nul:'
        else:
            command = 'xls2csv -d 8859-1 -q 0 "%s" 2> /dev/null'
        output = self.execute(command % filename)
        return output.decode('ISO8859-1')
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 helmutm
						helmutm