From ea0999a5c0b72183c6896eb59dd6301202b7547c Mon Sep 17 00:00:00 2001 From: Helmut Merz Date: Fri, 18 Jan 2013 16:36:04 +0100 Subject: [PATCH] provide stripping of HTML comments --- util/html.py | 9 ++++++++- util/html.txt | 9 ++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/util/html.py b/util/html.py index e3950b1..f3dba46 100644 --- a/util/html.py +++ b/util/html.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2012 Helmut Merz helmutm@cy55.de +# Copyright (c) 2013 Helmut Merz helmutm@cy55.de # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -76,6 +76,13 @@ def checkStyle(k): return False +def stripComments(value): + soup = BeautifulSoup(value) + for comment in soup.findAll(text=lambda text: isinstance(text, Comment)): + comment.extract() + return soup.renderContents().decode('utf8') + + def stripAll(value): value = sanitize(value) def collectText(tags): diff --git a/util/html.txt b/util/html.txt index 176c1a3..b94cd9f 100644 --- a/util/html.txt +++ b/util/html.txt @@ -2,9 +2,7 @@ Tweaking HTML text ================== -$Id$ - - >>> from cybertools.util.html import sanitize + >>> from cybertools.util.html import sanitize, stripComments >>> input = """ ...

@@ -28,6 +26,11 @@ All comments are stripped from the HTML input. >>> sanitize(input2) u'\n

text

\n\n

text

' +It's also possible to remove only the comments from the HTML input. + + >>> stripComments(input2) + u'\n

text

\n\n

text

' + It is also possible to strip all HTML tags from the input string. >>> from cybertools.util.html import stripAll