From 439ddf7c3db12a94d648eca76d1b144ae583638f Mon Sep 17 00:00:00 2001
From: helmutm <helmutm@fd906abe-77d9-0310-91a1-e0d9ade77398>
Date: Mon, 8 Mar 2010 07:20:54 +0000
Subject: [PATCH] provide simple function for stripping all HTML tags from a
 text

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@3759 fd906abe-77d9-0310-91a1-e0d9ade77398
---
 util/html.py  | 18 +++++++++++++++++-
 util/html.txt | 12 +++++++++---
 2 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/util/html.py b/util/html.py
index 7bc3992..f701e1d 100644
--- a/util/html.py
+++ b/util/html.py
@@ -1,5 +1,5 @@
 #
-#  Copyright (c) 2009 Helmut Merz helmutm@cy55.de
+#  Copyright (c) 2010 Helmut Merz helmutm@cy55.de
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
@@ -23,6 +23,7 @@ $Id$
 """
 
 from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
+from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
 
 validTags = ('a b br div em font h1 h2 h3 i li ol p pre span strong '
              'table td tr u ul').split()
@@ -72,3 +73,18 @@ def checkStyle(k):
         if k.startswith(name):
             return True
     return False
+
+
+def stripAll(value):  # return the text of HTML `value` with all markup removed
+    def collectText(tags):  # recursively append plain text nodes to `data`
+        for tag in tags:
+            if type(tag) is NavigableString:  # exact type: plain text only
+                data.append(tag)
+            elif not (tag is None or isinstance(tag, NavigableString)):
+                collectText(tag.contents)  # string subclasses (Declaration, presumably Comment etc.) are skipped: assumed to lack .contents
+    data = []  # text fragments in document order
+    soup = BeautifulSoup(value)
+    collectText(soup.contents)
+    text = u''.join(data).replace(u'\n', u'').replace(u'&nbsp;', u' ')
+    return text  # newlines removed, literal '&nbsp;' replaced by a space
+
diff --git a/util/html.txt b/util/html.txt
index cea5772..176c1a3 100644
--- a/util/html.txt
+++ b/util/html.txt
@@ -8,15 +8,15 @@ $Id$
 
   >>> input = """<html>
   ... <p class="standard" style="font-size: 200%; font-weight: bold">
-  ...   <a href="blubb"><b>Text</b></a>
+  ...   <a href="blubb"><b>Text</b>, and more</a>
   ... </p>
   ... </html>"""
 
   >>> sanitize(input, validAttrs=['style'])
-  u'\n<p style="font-weight: bold">\n<a><b>Text</b></a>\n</p>\n'
+  u'\n<p style="font-weight: bold">\n<a><b>Text</b>, and more</a>\n</p>\n'
 
   >>> sanitize(input, ['p', 'b'], ['class'])
-  u'\n<p class="standard">\n<b>Text</b>\n</p>\n'
+  u'\n<p class="standard">\n<b>Text</b>, and more\n</p>\n'
 
 All comments are stripped from the HTML input.
 
@@ -27,3 +27,9 @@ All comments are stripped from the HTML input.
 
   >>> sanitize(input2)
   u'\n<p>text</p>\n\n<p>text</p>'
+
+It is also possible to strip all HTML tags from the input string.
+
+  >>> from cybertools.util.html import stripAll
+  >>> stripAll(input)
+  u'Text, and more'