make HTML transformation work
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3214 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
d7d6558b3a
commit
70ec37efb8
2 changed files with 15 additions and 6 deletions
|
@ -22,7 +22,7 @@ HTML
|
|||
|
||||
>>> from cybertools.text.html import htmlToText
|
||||
>>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
|
||||
>>> text = htmlToText(html)
|
||||
>>> text = htmlToText(html.decode('ISO8859-15'))
|
||||
>>> '<p>' in html
|
||||
True
|
||||
>>> '<p>' in text
|
||||
|
|
19
text/html.py
19
text/html.py
|
@ -26,13 +26,22 @@ import os, sys
|
|||
from cStringIO import StringIO
|
||||
|
||||
from cybertools.text import base
|
||||
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString
|
||||
from cybertools.text.lib.BeautifulSoup import BeautifulSoup
|
||||
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
|
||||
|
||||
|
||||
def htmlToText(html):
|
||||
class HtmlTransform(base.BaseTransform):
|
||||
|
||||
def __call__(self, fr):
|
||||
input = fr.read().decode('UTF-8')
|
||||
return htmlToText(input)
|
||||
|
||||
|
||||
def htmlToText(input):
|
||||
data = []
|
||||
soup = BeautifulSoup(html).html
|
||||
collectText([soup], data)
|
||||
input = input.replace(u'<!--', u'')
|
||||
soup = BeautifulSoup(input)
|
||||
collectText(soup.contents, data)
|
||||
text = u' '.join(data).replace(u'\n', u'').replace(u' ', u'')
|
||||
return text
|
||||
|
||||
|
@ -40,5 +49,5 @@ def collectText(tags, data):
|
|||
for tag in tags:
|
||||
if type(tag) is NavigableString:
|
||||
data.append(tag)
|
||||
else:
|
||||
elif tag is not None and type(tag) is not Declaration:
|
||||
collectText(tag.contents, data)
|
||||
|
|
Loading…
Add table
Reference in a new issue