#
#  Copyright (c) 2009 Helmut Merz helmutm@cy55.de
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

"""
Strip HTML tags and other HTML-related utilities.

$Id$
"""

# Tags that are kept in the output; every other tag is hidden, i.e. only
# its contents are rendered.
# Alternative whitelist kept for reference:
#validTags = 'p i strong b u a h1 h2 h3 img pre br'.split()
validTags = 'b br div em font h1 h2 h3 i p pre span strong table td tr u'.split()

# Attributes that are kept on tags; all other attributes are removed.
# Alternative whitelist kept for reference:
#validAttrs = 'href src'.split()
validAttrs = 'class style'.split()

# CSS properties that are kept inside a ``style`` attribute.
validStyles = 'font-style font-weight'.split()


def sanitize(value, validTags=validTags, validAttrs=validAttrs,
             validStyles=validStyles):
    """Return ``value`` reduced to whitelisted tags, attributes and styles.

    The markup is parsed with BeautifulSoup and then cleaned up:

    * HTML comments are removed.
    * Tags whose name is not in ``validTags`` are marked as hidden, so
      that only their contents end up in the result.
    * Attributes not listed in ``validAttrs`` are removed.
    * ``style`` attributes are filtered with ``sanitizeStyle()`` using
      ``validStyles``; attributes whose value ends up empty are removed.

    value       -- the HTML text to clean up.
    validTags   -- names of the tags to keep.
    validAttrs  -- names of the attributes to keep.
    validStyles -- names of the CSS properties to keep in ``style``.

    Returns the rendered result, decoded from UTF-8.
    """
    # Imported here instead of at module level so that sanitizeStyle()
    # stays usable where the bundled parser cannot be imported.
    from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment

    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = []
        for attr, val in tag.attrs:
            if attr not in validAttrs:
                continue
            if attr == 'style':
                val = sanitizeStyle(val, validStyles)
            if val:
                # Attributes with an empty value are dropped altogether.
                attrs.append((attr, val))
        tag.attrs = attrs
    return soup.renderContents().decode('utf8')


def sanitizeStyle(value, validStyles=validStyles):
    """Return the declarations of ``value`` whose property is whitelisted.

    ``value`` is the content of a ``style`` attribute, i.e. a sequence of
    ``property: value`` declarations separated by semicolons.  A
    declaration is kept (stripped of surrounding whitespace, otherwise
    unchanged) if its property name is in ``validStyles``.  The property
    values themselves are not validated.

    value       -- the style string; an empty or missing value yields ''.
    validStyles -- names of the CSS properties to keep.

    Returns the remaining declarations joined with '; ', or '' if none
    is left.
    """
    if not value:
        return ''
    result = []
    for item in value.split(';'):
        if ':' not in item:
            # Not a ``property: value`` declaration, e.g. the empty
            # string after a trailing semicolon.
            continue
        # Split at the first colon only: the value part may itself
        # contain colons (e.g. ``url(http://...)``), which must not make
        # the whole call fail.
        name = item.split(':', 1)[0].strip()
        if name in validStyles:
            result.append(item.strip())
    return '; '.join(result)
+ ... Text + ...
+ ... """ + + >>> sanitize(input, validAttrs=['style']) + u'\n\nText\n
\n' + + >>> sanitize(input, ['p', 'b'], ['class']) + u'\n\nText\n
\n' + diff --git a/util/tests.py b/util/tests.py index 0ac1c0f..a5ef08e 100755 --- a/util/tests.py +++ b/util/tests.py @@ -24,6 +24,7 @@ def test_suite(): doctest.DocFileSuite('config.txt', optionflags=flags), doctest.DocFileSuite('defer.txt', optionflags=flags), doctest.DocFileSuite('format.txt', optionflags=flags), + doctest.DocFileSuite('html.txt', optionflags=flags), doctest.DocFileSuite('multikey.txt', optionflags=flags), doctest.DocFileSuite('property.txt', optionflags=flags), doctest.DocFileSuite('json.txt', optionflags=flags),