cybertools/text/pdf.py
helmutm f2ec195a55 added package cybertools.text
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1383 fd906abe-77d9-0310-91a1-e0d9ade77398
2006-10-04 08:53:58 +00:00

26 lines
711 B
Python

"""Searchable text support for Portable Document Format (PDF) files.
This uses the pdftotext command from xpdf to perform the extraction.
"""
__docformat__ = "reStructuredText"
import os, sys
import subprocess

from cybertools.text import base
class PdfTransform(base.BaseFileTransform):
    """Text extraction for PDF files, delegating to the external
    ``pdftotext`` program.

    NOTE(review): how ``extension`` and ``extract()`` are used is defined by
    ``base.BaseFileTransform``, which is not visible here -- confirm there.
    """

    # File name extension associated with this transform.
    extension = ".pdf"

    def extract(self, directory, filename):
        """Convert a PDF file to text and return the text as unicode.

        :param directory: directory in which the intermediate text file
            ``words.txt`` is written.
        :param filename: path of the PDF file to convert.
        :return: the text produced by ``pdftotext``, decoded from UTF-8;
            an empty unicode string if ``pdftotext`` is not available or
            exits with a non-zero status.
        """
        if not base.haveProgram("pdftotext"):
            print('Warning: pdftotext is not available')
            return u''
        txtfile = os.path.join(directory, "words.txt")
        # Pass an argument list instead of a shell command line: paths that
        # contain blanks or shell metacharacters are handed to pdftotext
        # unchanged and can never be interpreted by a shell.
        status = subprocess.call(
            ["pdftotext", "-enc", "UTF-8", filename, txtfile])
        if status != 0:
            # Without this check a failed conversion would either raise on
            # open() or silently return a stale words.txt from an earlier run.
            print('Warning: pdftotext failed with exit status %s for %s'
                  % (status, filename))
            return u''
        f = open(txtfile, "rb")
        try:
            data = f.read()
        finally:
            # Close the file even if reading fails.
            f.close()
        return data.decode("utf-8")