provide text converters for XLS and PPT
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1626 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
		
							parent
							
								
									65249df13f
								
							
						
					
					
						commit
						06d4652807
					
				
					 9 changed files with 124 additions and 4 deletions
				
			
		|  | @ -66,3 +66,32 @@ Word Documents | |||
|   89 | ||||
|   >>> u'lamb' in words | ||||
|   True | ||||
| 
 | ||||
| PowerPoint Presentations | ||||
| ------------------------ | ||||
| 
 | ||||
|   >>> from cybertools.text.ppt import PptTransform | ||||
|   >>> transform = PptTransform(None) | ||||
|   >>> f = open(os.path.join(testdir, 'mary.ppt')) | ||||
|   >>> result = transform(f) | ||||
|   >>> print log | ||||
|   >>> words = result.split() | ||||
|   >>> len(words) | ||||
|   102 | ||||
|   >>> u'lamb' in words | ||||
|   True | ||||
| 
 | ||||
| Excel Spreadsheets | ||||
| ------------------ | ||||
| 
 | ||||
|   >>> from cybertools.text.xls import XlsTransform | ||||
|   >>> transform = XlsTransform(None) | ||||
|   >>> f = open(os.path.join(testdir, 'mary.xls')) | ||||
|   >>> result = transform(f) | ||||
|   >>> print log | ||||
|   >>> words = result.split() | ||||
|   >>> len(words) | ||||
|   89 | ||||
|   >>> u'lamb' in words | ||||
|   True | ||||
| 
 | ||||
|  |  | |||
|  | @ -17,10 +17,9 @@ | |||
| # | ||||
| 
 | ||||
| """ | ||||
| Searchable text support for Portable Document Format (PDF) files. | ||||
| Searchable text support for MS Word (.doc) files. | ||||
| 
 | ||||
| This uses the pdftotext command from xpdf to perform the extraction. | ||||
| interface definitions for text transformations. | ||||
| This uses the wvware command to perform the extraction. | ||||
| 
 | ||||
| Based on code provided by zc.index and TextIndexNG3. | ||||
| 
 | ||||
|  |  | |||
|  | @ -20,7 +20,6 @@ | |||
| Searchable text support for Portable Document Format (PDF) files. | ||||
| 
 | ||||
| This uses the pdftotext command from xpdf to perform the extraction. | ||||
| interface definitions for text transformations. | ||||
| 
 | ||||
| Based on code provided by zc.index and TextIndexNG3. | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										48
									
								
								text/ppt.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								text/ppt.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,48 @@ | |||
| # | ||||
| #  Copyright (c) 2007 Helmut Merz helmutm@cy55.de | ||||
| # | ||||
| #  This program is free software; you can redistribute it and/or modify | ||||
| #  it under the terms of the GNU General Public License as published by | ||||
| #  the Free Software Foundation; either version 2 of the License, or | ||||
| #  (at your option) any later version. | ||||
| # | ||||
| #  This program is distributed in the hope that it will be useful, | ||||
| #  but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| #  GNU General Public License for more details. | ||||
| # | ||||
| #  You should have received a copy of the GNU General Public License | ||||
| #  along with this program; if not, write to the Free Software | ||||
| #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||||
| # | ||||
| 
 | ||||
| """ | ||||
| Searchable text support for MS PowerPoint (.ppt) files. | ||||
| 
 | ||||
| This uses the ppthtml command to perform the extraction. | ||||
| 
 | ||||
| Based on code provided by zc.index and TextIndexNG3. | ||||
| 
 | ||||
| $Id$ | ||||
| """ | ||||
| 
 | ||||
| import os, sys | ||||
| 
 | ||||
| from cybertools.text import base | ||||
| from cybertools.text.html import htmlToText | ||||
| 
 | ||||
| 
 | ||||
class PptTransform(base.BaseFileTransform):
    """Text transform for MS PowerPoint (.ppt) files.

    The text is obtained by running the external ``ppthtml`` command and
    converting the HTML it produces to plain text.
    """

    extension = ".ppt"

    def extract(self, directory, filename):
        """Return the text extracted from the file named ``filename``.

        ``directory`` is accepted but not used by this method.  An empty
        unicode string is returned when ``checkAvailable`` gives a false
        result for ``ppthtml`` (presumably: the command is not installed
        -- confirm against the base class).  Otherwise the command output
        is passed through ``htmlToText`` and decoded as ISO8859-15.
        """
        if not self.checkAvailable('ppthtml', 'ppthtml is not available'):
            return u''
        # The command's stderr is redirected to the platform's null device.
        if sys.platform != 'win32':
            command = 'ppthtml "%s" 2> /dev/null' % filename
        else:
            command = 'ppthtml "%s" 2> nul:' % filename
        markup = self.execute(command)
        text = htmlToText(markup)
        return text.decode('ISO8859-15')
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.odp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.odp
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.ods
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.ods
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.ppt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.ppt
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								text/testfiles/mary.xls
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								text/testfiles/mary.xls
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										45
									
								
								text/xls.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								text/xls.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,45 @@ | |||
| # | ||||
| #  Copyright (c) 2007 Helmut Merz helmutm@cy55.de | ||||
| # | ||||
| #  This program is free software; you can redistribute it and/or modify | ||||
| #  it under the terms of the GNU General Public License as published by | ||||
| #  the Free Software Foundation; either version 2 of the License, or | ||||
| #  (at your option) any later version. | ||||
| # | ||||
| #  This program is distributed in the hope that it will be useful, | ||||
| #  but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| #  GNU General Public License for more details. | ||||
| # | ||||
| #  You should have received a copy of the GNU General Public License | ||||
| #  along with this program; if not, write to the Free Software | ||||
| #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||||
| # | ||||
| 
 | ||||
| """ | ||||
| Searchable text support for MS Excel (.xls) files. | ||||
| 
 | ||||
| This uses the xls2csv command to perform the extraction. | ||||
| 
 | ||||
| Based on code provided by zc.index and TextIndexNG3. | ||||
| 
 | ||||
| $Id$ | ||||
| """ | ||||
| 
 | ||||
| import os, sys | ||||
| 
 | ||||
| from cybertools.text import base | ||||
| 
 | ||||
| 
 | ||||
class XlsTransform(base.BaseFileTransform):
    """Text transform for MS Excel (.xls) files.

    The text is obtained by running the external ``xls2csv`` command.
    """

    extension = ".xls"

    def extract(self, directory, filename):
        """Return the text extracted from the file named ``filename``.

        ``directory`` is accepted but not used by this method.  An empty
        unicode string is returned when ``checkAvailable`` gives a false
        result for ``xls2csv`` (presumably: the command is not installed
        -- confirm against the base class).  Otherwise the command output
        is decoded as ISO8859-1, matching the ``-d 8859-1`` option passed
        to ``xls2csv``.
        """
        if not self.checkAvailable('xls2csv', 'xls2csv is not available'):
            return u''
        # The command's stderr is redirected to the platform's null device.
        if sys.platform != 'win32':
            command = 'xls2csv -d 8859-1 -q 0 "%s" 2> /dev/null' % filename
        else:
            command = 'xls2csv -d 8859-1 -q 0 "%s" 2> nul:' % filename
        output = self.execute(command)
        return output.decode('ISO8859-1')
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 helmutm
						helmutm