Remove MS Word's special (conditional `[if ...]...[endif]`) HTML comments, even when they are escaped
This commit is contained in:
parent
074528711d
commit
4fc0538e30
1 changed file with 9 additions and 2 deletions
11
util/html.py
11
util/html.py
|
@ -20,6 +20,8 @@
|
|||
Strip HTML tags and other HTML-related utilities.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
|
||||
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
|
||||
|
||||
|
@ -32,9 +34,11 @@ validAttrs = ('align alt border cellpadding cellspacing class colspan '
|
|||
validStyles = 'font-style font-weight'.split()
|
||||
validStyleParts = 'border padding'.split()
|
||||
|
||||
escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL)
|
||||
|
||||
|
||||
def sanitize(value, validTags=validTags, validAttrs=validAttrs,
|
||||
validStyles=validStyles):
|
||||
validStyles=validStyles, stripEscapedComments=True):
|
||||
soup = BeautifulSoup(value)
|
||||
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
|
||||
comment.extract()
|
||||
|
@ -51,7 +55,10 @@ def sanitize(value, validTags=validTags, validAttrs=validAttrs,
|
|||
if val:
|
||||
attrs.append((attr, val))
|
||||
tag.attrs = attrs
|
||||
return soup.renderContents().decode('utf8')
|
||||
result = soup.renderContents()
|
||||
if stripEscapedComments:
|
||||
result = escCommPattern.sub(u'', result)
|
||||
return result.decode('utf8')
|
||||
|
||||
|
||||
def sanitizeStyle(value, validStyles=validStyles):
|
||||
|
|
Loading…
Add table
Reference in a new issue