# Extracted from the patch "provide simple function for stripping all HTML
# tags from a text" (helmutm, 2010-03-08, cybertools trunk@3759), which adds
# stripAll() to util/html.py.

from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString


def stripAll(value):
    """Return the text of ``value`` with all HTML markup removed.

    ``value`` is parsed with BeautifulSoup and the resulting tree is walked
    depth-first. Only plain text nodes are collected; tags contribute
    nothing but the text nested inside them. The collected pieces are
    joined, newlines are dropped and double spaces are reduced to one.

    Parameters:
        value -- the HTML source to strip (anything BeautifulSoup accepts).

    Returns:
        A unicode string containing only the text content.
    """
    data = []

    def collectText(nodes):
        # Append every plain text node below ``nodes`` to ``data``.
        for node in nodes:
            if type(node) is NavigableString:
                # Exact type match on purpose: the markup-carrying node
                # types derive from NavigableString and must not be
                # treated as text.
                data.append(node)
            elif node is None or isinstance(
                    node, (NavigableString, Declaration)):
                # Comments, declarations, CDATA sections and processing
                # instructions are NavigableString subclasses in
                # BeautifulSoup 3 and have no ``contents``; recursing into
                # them used to raise AttributeError.
                continue
            else:
                collectText(node.contents)

    collectText(BeautifulSoup(value).contents)
    text = u''.join(data).replace(u'\n', u'')
    # Removing newlines glues indented lines together with runs of blanks;
    # fold each pair of spaces into one (single pass, as before).
    return text.replace(u'  ', u' ')

- ... Text + ... Text, and more ...

... """ >>> sanitize(input, validAttrs=['style']) - u'\n

\nText\n

\n' + u'\n

\nText, and more\n

\n' >>> sanitize(input, ['p', 'b'], ['class']) - u'\n

\nText\n

\n' + u'\n

\nText, and more\n

\n' All comments are stripped from the HTML input. @@ -27,3 +27,9 @@ All comments are stripped from the HTML input. >>> sanitize(input2) u'\n

text

\n\n

text

' + +It is also possible to strip all HTML tags from the input string. + + >>> from cybertools.util.html import stripAll + >>> stripAll(input) + u'Text, and more'