Merge branch 'master' into bbmaster

This commit is contained in:
Helmut Merz 2013-04-01 12:41:10 +02:00
commit 524975de32
2 changed files with 26 additions and 0 deletions

View file

@ -36,6 +36,7 @@ validStyleParts = 'border padding'.split()
escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL)
sentencePattern = re.compile(r'[:.\?\!]')
def sanitize(value, validTags=validTags, validAttrs=validAttrs,
validStyles=validStyles, stripEscapedComments=True):
@ -104,3 +105,14 @@ def stripAll(value):
text = u''.join(data).replace(u'\n', u'').replace(u' ', u' ')
return text
def extractFirstPart(value):
    """Return the leading part of an HTML text, wrapped in ``<p>...</p>``.

    If ``value`` contains a ``<p>`` element, the rendered contents of the
    first one are used. Otherwise all markup is stripped (via ``stripAll``)
    and the text up to the first sentence delimiter matched by
    ``sentencePattern`` (one of ``: . ? !``) is used.

    The result is always a unicode string.
    """
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        if tag.name in ('p',):
            part = tag.renderContents()
            break
    else:
        # No paragraph found: fall back to the first sentence of the
        # plain text.
        text = stripAll(value)
        part = sentencePattern.split(text)[0]
    # renderContents() is expected to return an encoded byte string
    # (UTF-8 presumably, as the original code decoded with 'utf8'), whereas
    # the stripAll() fallback already yields unicode. Decoding a unicode
    # object would trigger an implicit ASCII encode and fail on non-ASCII
    # characters, so decode only when we really have bytes.
    if isinstance(part, bytes):
        part = part.decode('utf8')
    return u'<p>%s</p>' % part

View file

@ -10,6 +10,9 @@ Tweaking HTML text
... </p>
... </html>"""
Sanitize HTML
-------------
>>> sanitize(input, validAttrs=['style'])
u'\n<p style="font-weight: bold">\n<a><b>Text</b>, and more</a>\n</p>\n'
@ -36,3 +39,14 @@ It is also possible to strip all HTML tags from the input string.
>>> from cybertools.util.html import stripAll
>>> stripAll(input)
u'Text, and more'
Extract first part of an HTML text
----------------------------------
>>> from cybertools.util.html import extractFirstPart
>>> extractFirstPart(input)
u'<p>\n<a href="blubb"><b>Text</b>, and more</a>\n</p>'
>>> extractFirstPart(input2)
u'<p>text</p>'