make HTML transformation work

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3214 fd906abe-77d9-0310-91a1-e0d9ade77398
2009-02-07 16:27:04 +00:00 · 2009-02-07 16:27:04 +00:00 · 70ec37efb8
commit 70ec37efb8
parent d7d6558b3a
2 changed files with 15 additions and 6 deletions
--- a/text/README.txt
+++ b/text/README.txt
@@ -22,7 +22,7 @@ HTML
  >>> from cybertools.text.html import htmlToText
  >>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
-  >>> text = htmlToText(html)
+  >>> text = htmlToText(html.decode('ISO8859-15'))
  >>> '<p>' in html
  True
  >>> '<p>' in text
--- a/text/html.py
+++ b/text/html.py
@@ -26,13 +26,22 @@ import os, sys
 from cStringIO import StringIO
 from cybertools.text import base
-from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString
+from cybertools.text.lib.BeautifulSoup import BeautifulSoup
 from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
-def htmlToText(html):
+class HtmlTransform(base.BaseTransform):
    def __call__(self, fr):
        input = fr.read().decode('UTF-8')
        return htmlToText(input)
 def htmlToText(input):
    data = []
-    soup = BeautifulSoup(html).html
+    input = input.replace(u'<!--', u'')
-    collectText([soup], data)
+    soup = BeautifulSoup(input)
    collectText(soup.contents, data)
    text = u' '.join(data).replace(u'\n', u'').replace(u'&nbsp;', u'')
    return text
@@ -40,5 +49,5 @@ def collectText(tags, data):
    for tag in tags:
        if type(tag) is NavigableString:
            data.append(tag)
-        else:
+        elif tag is not None and type(tag) is not Declaration:
            collectText(tag.contents, data)