make HTML transformation work
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3214 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
d7d6558b3a
commit
70ec37efb8
2 changed files with 15 additions and 6 deletions
|
@@ -22,7 +22,7 @@ HTML
|
||||||
|
|
||||||
>>> from cybertools.text.html import htmlToText
|
>>> from cybertools.text.html import htmlToText
|
||||||
>>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
|
>>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
|
||||||
>>> text = htmlToText(html)
|
>>> text = htmlToText(html.decode('ISO8859-15'))
|
||||||
>>> '<p>' in html
|
>>> '<p>' in html
|
||||||
True
|
True
|
||||||
>>> '<p>' in text
|
>>> '<p>' in text
|
||||||
|
|
19
text/html.py
19
text/html.py
|
@@ -26,13 +26,22 @@ import os, sys
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
|
||||||
from cybertools.text import base
|
from cybertools.text import base
|
||||||
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString
|
from cybertools.text.lib.BeautifulSoup import BeautifulSoup
|
||||||
|
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
|
||||||
|
|
||||||
|
|
||||||
def htmlToText(html):
|
class HtmlTransform(base.BaseTransform):
|
||||||
|
|
||||||
|
def __call__(self, fr):
|
||||||
|
input = fr.read().decode('UTF-8')
|
||||||
|
return htmlToText(input)
|
||||||
|
|
||||||
|
|
||||||
|
def htmlToText(input):
|
||||||
data = []
|
data = []
|
||||||
soup = BeautifulSoup(html).html
|
input = input.replace(u'<!--', u'')
|
||||||
collectText([soup], data)
|
soup = BeautifulSoup(input)
|
||||||
|
collectText(soup.contents, data)
|
||||||
text = u' '.join(data).replace(u'\n', u'').replace(u' ', u'')
|
text = u' '.join(data).replace(u'\n', u'').replace(u' ', u'')
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
@@ -40,5 +49,5 @@ def collectText(tags, data):
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
if type(tag) is NavigableString:
|
if type(tag) is NavigableString:
|
||||||
data.append(tag)
|
data.append(tag)
|
||||||
else:
|
elif tag is not None and type(tag) is not Declaration:
|
||||||
collectText(tag.contents, data)
|
collectText(tag.contents, data)
|
||||||
|
|
Loading…
Add table
Reference in a new issue