Remove special HTML comments generated by MS Word, even when they are escaped

This commit is contained in:
Helmut Merz 2013-03-01 16:31:02 +01:00
parent 074528711d
commit 4fc0538e30

View file

@ -20,6 +20,8 @@
Strip HTML tags and other HTML-related utilities.
"""
import re
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
@ -32,9 +34,11 @@ validAttrs = ('align alt border cellpadding cellspacing class colspan '
# Names accepted as style properties by default (default for the
# ``validStyles`` parameter of the functions below).
validStyles = ['font-style', 'font-weight']
validStyleParts = ['border', 'padding']

# Matches a conditional-comment block of the form
# ``<!--[if ...![endif]-->`` (per the commit message, the kind emitted
# by MS Word). The match is non-greedy, so each block is matched on its
# own, and DOTALL lets a block span several lines.
escCommPattern = re.compile(
    r'<\!--\[if .*?\!\[endif\]-->',
    re.DOTALL,
)
def sanitize(value, validTags=validTags, validAttrs=validAttrs,
validStyles=validStyles):
validStyles=validStyles, stripEscapedComments=True):
soup = BeautifulSoup(value)
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
comment.extract()
@ -51,7 +55,10 @@ def sanitize(value, validTags=validTags, validAttrs=validAttrs,
if val:
attrs.append((attr, val))
tag.attrs = attrs
return soup.renderContents().decode('utf8')
result = soup.renderContents()
if stripEscapedComments:
result = escCommPattern.sub(u'', result)
return result.decode('utf8')
def sanitizeStyle(value, validStyles=validStyles):