From 70ec37efb80674ac8ed7551b5e3f1813327682de Mon Sep 17 00:00:00 2001 From: helmutm Date: Sat, 7 Feb 2009 16:27:04 +0000 Subject: [PATCH] make HTML transformation work git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3214 fd906abe-77d9-0310-91a1-e0d9ade77398 --- text/README.txt | 2 +- text/html.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/text/README.txt b/text/README.txt index c3aad8b..d14f8e5 100644 --- a/text/README.txt +++ b/text/README.txt @@ -22,7 +22,7 @@ HTML >>> from cybertools.text.html import htmlToText >>> html = open(os.path.join(testdir, 'selfhtml.html')).read() - >>> text = htmlToText(html) + >>> text = htmlToText(html.decode('ISO8859-15')) >>> '

' in html True >>> '

' in text diff --git a/text/html.py b/text/html.py index a34239e..f984a43 100644 --- a/text/html.py +++ b/text/html.py @@ -26,13 +26,22 @@ import os, sys from cStringIO import StringIO from cybertools.text import base -from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString +from cybertools.text.lib.BeautifulSoup import BeautifulSoup +from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString -def htmlToText(html): +class HtmlTransform(base.BaseTransform): + + def __call__(self, fr): + input = fr.read().decode('UTF-8') + return htmlToText(input) + + +def htmlToText(input): data = [] - soup = BeautifulSoup(html).html - collectText([soup], data) + input = input.replace(u'