make HTML transformation work

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3214 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2009-02-07 16:27:04 +00:00
parent d7d6558b3a
commit 70ec37efb8
2 changed files with 15 additions and 6 deletions

View file

@@ -22,7 +22,7 @@ HTML
>>> from cybertools.text.html import htmlToText
>>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
>>> text = htmlToText(html)
>>> text = htmlToText(html.decode('ISO8859-15'))
>>> '<p>' in html
True
>>> '<p>' in text

View file

@@ -26,13 +26,22 @@ import os, sys
from cStringIO import StringIO
from cybertools.text import base
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString
from cybertools.text.lib.BeautifulSoup import BeautifulSoup
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
def htmlToText(html):
class HtmlTransform(base.BaseTransform):
def __call__(self, fr):
input = fr.read().decode('UTF-8')
return htmlToText(input)
def htmlToText(input):
data = []
soup = BeautifulSoup(html).html
collectText([soup], data)
input = input.replace(u'<!--', u'')
soup = BeautifulSoup(input)
collectText(soup.contents, data)
text = u' '.join(data).replace(u'\n', u'').replace(u'&nbsp;', u'')
return text
@@ -40,5 +49,5 @@ def collectText(tags, data):
for tag in tags:
if type(tag) is NavigableString:
data.append(tag)
else:
elif tag is not None and type(tag) is not Declaration:
collectText(tag.contents, data)