make HTML transformation work

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3214 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2009-02-07 16:27:04 +00:00
parent d7d6558b3a
commit 70ec37efb8
2 changed files with 15 additions and 6 deletions

View file

@ -22,7 +22,7 @@ HTML
>>> from cybertools.text.html import htmlToText >>> from cybertools.text.html import htmlToText
>>> html = open(os.path.join(testdir, 'selfhtml.html')).read() >>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
>>> text = htmlToText(html) >>> text = htmlToText(html.decode('ISO8859-15'))
>>> '<p>' in html >>> '<p>' in html
True True
>>> '<p>' in text >>> '<p>' in text

View file

@ -26,13 +26,22 @@ import os, sys
from cStringIO import StringIO from cStringIO import StringIO
from cybertools.text import base from cybertools.text import base
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString from cybertools.text.lib.BeautifulSoup import BeautifulSoup
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
def htmlToText(html): class HtmlTransform(base.BaseTransform):
def __call__(self, fr):
input = fr.read().decode('UTF-8')
return htmlToText(input)
def htmlToText(input):
data = [] data = []
soup = BeautifulSoup(html).html input = input.replace(u'<!--', u'')
collectText([soup], data) soup = BeautifulSoup(input)
collectText(soup.contents, data)
text = u' '.join(data).replace(u'\n', u'').replace(u'&nbsp;', u'') text = u' '.join(data).replace(u'\n', u'').replace(u'&nbsp;', u'')
return text return text
@ -40,5 +49,5 @@ def collectText(tags, data):
for tag in tags: for tag in tags:
if type(tag) is NavigableString: if type(tag) is NavigableString:
data.append(tag) data.append(tag)
else: elif tag is not None and type(tag) is not Declaration:
collectText(tag.contents, data) collectText(tag.contents, data)