Provide a simple function for stripping all HTML tags from a text

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3759 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2010-03-08 07:20:54 +00:00
parent 23fd90ac1f
commit 439ddf7c3d
2 changed files with 26 additions and 4 deletions

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2009 Helmut Merz helmutm@cy55.de
# Copyright (c) 2010 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -23,6 +23,7 @@ $Id$
"""
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
# HTML tag names, kept as a plain list of strings.
# NOTE(review): presumably the default whitelist of tags that sanitize()
# lets through - confirm against its signature.
validTags = [
    'a', 'b', 'br', 'div', 'em', 'font',
    'h1', 'h2', 'h3',
    'i', 'li', 'ol', 'p', 'pre', 'span', 'strong',
    'table', 'td', 'tr', 'u', 'ul',
]
@ -72,3 +73,18 @@ def checkStyle(k):
if k.startswith(name):
return True
return False
def stripAll(value):
    """Return the plain text of the HTML string ``value``, without any tags.

    ``value`` is parsed with BeautifulSoup and the resulting tree is walked
    in document order. Only plain text nodes are collected; the pieces are
    joined into one unicode string and all newline characters are removed.

    Comments, declarations and other special string nodes are left out of
    the result instead of being treated as elements.
    """
    def collectText(nodes):
        # Recursively append every plain text node below ``nodes`` to ``data``.
        for node in nodes:
            if type(node) is NavigableString:
                # Exact type test on purpose: in BeautifulSoup 3, Comment,
                # Declaration, CData and ProcessingInstruction are subclasses
                # of NavigableString and must not end up in the text.
                data.append(node)
            elif node is None or isinstance(node, NavigableString):
                # Special string nodes have no child nodes; asking them for
                # ``contents`` raises AttributeError. Skip all of them, not
                # only Declaration.
                continue
            else:
                collectText(node.contents)

    data = []
    soup = BeautifulSoup(value)
    collectText(soup.contents)
    # NOTE(review): as written, the second replace() swaps a space for a
    # space, i.e. it is a no-op. Presumably it was meant to normalise some
    # other whitespace (e.g. a non-breaking space) - confirm against the
    # repository before changing it.
    text = u''.join(data).replace(u'\n', u'').replace(u' ', u' ')
    return text

View file

@ -8,15 +8,15 @@ $Id$
>>> input = """<html>
... <p class="standard" style="font-size: 200%; font-weight: bold">
... <a href="blubb"><b>Text</b></a>
... <a href="blubb"><b>Text</b>, and more</a>
... </p>
... </html>"""
>>> sanitize(input, validAttrs=['style'])
u'\n<p style="font-weight: bold">\n<a><b>Text</b></a>\n</p>\n'
u'\n<p style="font-weight: bold">\n<a><b>Text</b>, and more</a>\n</p>\n'
>>> sanitize(input, ['p', 'b'], ['class'])
u'\n<p class="standard">\n<b>Text</b>\n</p>\n'
u'\n<p class="standard">\n<b>Text</b>, and more\n</p>\n'
All comments are stripped from the HTML input.
@ -27,3 +27,9 @@ All comments are stripped from the HTML input.
>>> sanitize(input2)
u'\n<p>text</p>\n\n<p>text</p>'
It is also possible to strip all HTML tags from the input string.
>>> from cybertools.util.html import stripAll
>>> stripAll(input)
u'Text, and more'