diff --git a/text/README.txt b/text/README.txt index c3aad8b..d14f8e5 100644 --- a/text/README.txt +++ b/text/README.txt @@ -22,7 +22,7 @@ HTML >>> from cybertools.text.html import htmlToText >>> html = open(os.path.join(testdir, 'selfhtml.html')).read() - >>> text = htmlToText(html) + >>> text = htmlToText(html.decode('ISO8859-15')) >>> '

' in html True >>> '

' in text diff --git a/text/html.py b/text/html.py index a34239e..f984a43 100644 --- a/text/html.py +++ b/text/html.py @@ -26,13 +26,22 @@ import os, sys from cStringIO import StringIO from cybertools.text import base -from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString +from cybertools.text.lib.BeautifulSoup import BeautifulSoup +from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString -def htmlToText(html): +class HtmlTransform(base.BaseTransform): + + def __call__(self, fr): + input = fr.read().decode('UTF-8') + return htmlToText(input) + + +def htmlToText(input): data = [] - soup = BeautifulSoup(html).html - collectText([soup], data) + input = input.replace(u'