remove special HTML comments from MS Word even when escaped
This commit is contained in:
parent
074528711d
commit
4fc0538e30
1 changed file with 9 additions and 2 deletions
11
util/html.py
11
util/html.py
|
@@ -20,6 +20,8 @@
|
||||||
Strip HTML tags and other HTML-related utilities.
|
Strip HTML tags and other HTML-related utilities.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
|
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Comment
|
||||||
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
|
from cybertools.text.lib.BeautifulSoup import Declaration, NavigableString
|
||||||
|
|
||||||
|
@@ -32,9 +34,11 @@ validAttrs = ('align alt border cellpadding cellspacing class colspan '
|
||||||
validStyles = 'font-style font-weight'.split()
|
validStyles = 'font-style font-weight'.split()
|
||||||
validStyleParts = 'border padding'.split()
|
validStyleParts = 'border padding'.split()
|
||||||
|
|
||||||
|
escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
def sanitize(value, validTags=validTags, validAttrs=validAttrs,
|
def sanitize(value, validTags=validTags, validAttrs=validAttrs,
|
||||||
validStyles=validStyles):
|
validStyles=validStyles, stripEscapedComments=True):
|
||||||
soup = BeautifulSoup(value)
|
soup = BeautifulSoup(value)
|
||||||
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
|
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
|
||||||
comment.extract()
|
comment.extract()
|
||||||
|
@@ -51,7 +55,10 @@ def sanitize(value, validTags=validTags, validAttrs=validAttrs,
|
||||||
if val:
|
if val:
|
||||||
attrs.append((attr, val))
|
attrs.append((attr, val))
|
||||||
tag.attrs = attrs
|
tag.attrs = attrs
|
||||||
return soup.renderContents().decode('utf8')
|
result = soup.renderContents()
|
||||||
|
if stripEscapedComments:
|
||||||
|
result = escCommPattern.sub(u'', result)
|
||||||
|
return result.decode('utf8')
|
||||||
|
|
||||||
|
|
||||||
def sanitizeStyle(value, validStyles=validStyles):
|
def sanitizeStyle(value, validStyles=validStyles):
|
||||||
|
|
Loading…
Add table
Reference in a new issue