From 4fc0538e30786f18b75a01583948e8acf7a7cce8 Mon Sep 17 00:00:00 2001 From: Helmut Merz Date: Fri, 1 Mar 2013 16:31:02 +0100 Subject: [PATCH] remove special HTML comments from MS Word even when escaped --- util/html.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/util/html.py b/util/html.py index f3dba46..56abd5a 100644 --- a/util/html.py +++ b/util/html.py @@ -20,6 +20,8 @@ Strip HTML tags and other HTML-related utilities. """ +import re + from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString @@ -32,9 +34,11 @@ validAttrs = ('align alt border cellpadding cellspacing class colspan ' validStyles = 'font-style font-weight'.split() validStyleParts = 'border padding'.split() +escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL) + def sanitize(value, validTags=validTags, validAttrs=validAttrs, - validStyles=validStyles): + validStyles=validStyles, stripEscapedComments=True): soup = BeautifulSoup(value) for comment in soup.findAll(text=lambda text: isinstance(text, Comment)): comment.extract() @@ -51,7 +55,10 @@ def sanitize(value, validTags=validTags, validAttrs=validAttrs, if val: attrs.append((attr, val)) tag.attrs = attrs - return soup.renderContents().decode('utf8') + result = soup.renderContents() + if stripEscapedComments: + result = escCommPattern.sub(u'', result) + return result.decode('utf8') def sanitizeStyle(value, validStyles=validStyles):