Merge branch 'master' into bbmaster
This commit is contained in:
commit
524975de32
2 changed files with 26 additions and 0 deletions
12
util/html.py
12
util/html.py
|
@ -36,6 +36,7 @@ validStyleParts = 'border padding'.split()
|
|||
|
||||
escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL)
|
||||
|
||||
sentencePattern = re.compile(r'[:.\?\!]')
|
||||
|
||||
def sanitize(value, validTags=validTags, validAttrs=validAttrs,
|
||||
validStyles=validStyles, stripEscapedComments=True):
|
||||
|
@ -104,3 +105,14 @@ def stripAll(value):
|
|||
text = u''.join(data).replace(u'\n', u'').replace(u' ', u' ')
|
||||
return text
|
||||
|
||||
|
||||
def extractFirstPart(value):
    """Return the leading part of an HTML text, wrapped in a ``<p>`` element.

    The rendered contents of the first ``<p>`` tag found in ``value`` are
    used. If ``value`` contains no ``<p>`` tag, all markup is removed via
    ``stripAll()`` and the text up to the first sentence delimiter matched
    by ``sentencePattern`` is used instead.

    The result is always a unicode string.
    """
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        if tag.name in ('p',):
            part = tag.renderContents()
            break
    else:
        # No paragraph element found: fall back to the first sentence of
        # the plain text.
        text = stripAll(value)
        part = sentencePattern.split(text)[0]
    # The two branches yield different string types: renderContents() is
    # presumably a UTF-8 encoded byte string (the original code decoded the
    # result as utf8 -- confirm against the BeautifulSoup version in use),
    # while stripAll() returns unicode. Decode only byte strings; calling
    # .decode() on unicode would force an implicit ASCII encode first and
    # fail on any non-ASCII character.
    if isinstance(part, bytes):
        part = part.decode('utf8')
    return u'<p>%s</p>' % part
|
||||
|
|
|
@ -10,6 +10,9 @@ Tweaking HTML text
|
|||
... </p>
|
||||
... </html>"""
|
||||
|
||||
Sanitize HTML
|
||||
-------------
|
||||
|
||||
>>> sanitize(input, validAttrs=['style'])
|
||||
u'\n<p style="font-weight: bold">\n<a><b>Text</b>, and more</a>\n</p>\n'
|
||||
|
||||
|
@ -36,3 +39,14 @@ It is also possible to strip all HTML tags from the input string.
|
|||
>>> from cybertools.util.html import stripAll
|
||||
>>> stripAll(input)
|
||||
u'Text, and more'
|
||||
|
||||
Extract first part of an HTML text
|
||||
----------------------------------
|
||||
|
||||
>>> from cybertools.util.html import extractFirstPart
|
||||
|
||||
>>> extractFirstPart(input)
|
||||
u'<p>\n<a href="blubb"><b>Text</b>, and more</a>\n</p>'
|
||||
|
||||
>>> extractFirstPart(input2)
|
||||
u'<p>text</p>'
|
||||
|
|
Loading…
Add table
Reference in a new issue