Merge branch 'master' into bbmaster

2013-04-01 12:41:10 +02:00 · 2013-04-01 12:41:10 +02:00 · 524975de32
commit 524975de32
parent d3986258db 0bb012a9c6
2 changed files with 26 additions and 0 deletions
--- a/util/html.py
+++ b/util/html.py
@@ -36,6 +36,7 @@ validStyleParts = 'border padding'.split()

 escCommPattern = re.compile(r'&lt;\!--\[if .*?\!\[endif\]--&gt;', re.DOTALL)

+sentencePattern = re.compile(r'[:.\?\!]')

 def sanitize(value, validTags=validTags, validAttrs=validAttrs,
                    validStyles=validStyles, stripEscapedComments=True):
@@ -104,3 +105,14 @@ def stripAll(value):
    text = u''.join(data).replace(u'\n', u'').replace(u'&nbsp;', u' ')
    return text

+
+def extractFirstPart(value):
+    """Return the first <p> element's content, else the first sentence, as u'<p>..</p>'."""
+    for tag in BeautifulSoup(value).findAll(True):
+        if tag.name in ('p',):
+            part = tag.renderContents()
+            break
+    else:
+        part = sentencePattern.split(stripAll(value))[0]
+    # Decode byte strings only; unicode.decode() would ASCII-encode first and fail.
+    return u'<p>%s</p>' % (part if isinstance(part, unicode) else part.decode('utf8'))
--- a/util/html.txt
+++ b/util/html.txt
@@ -10,6 +10,9 @@ Tweaking HTML text
  ... </p>
  ... </html>"""

+Sanitize HTML
+-------------
+
  >>> sanitize(input, validAttrs=['style'])
  u'\n<p style="font-weight: bold">\n<a><b>Text</b>, and more</a>\n</p>\n'

@@ -36,3 +39,14 @@ It is also possible to strip all HTML tags from the input string.
  >>> from cybertools.util.html import stripAll
  >>> stripAll(input)
  u'Text, and more'
+
+Extract first part of an HTML text
+----------------------------------
+
+  >>> from cybertools.util.html import extractFirstPart
+
+  >>> extractFirstPart(input)
+  u'<p>\n<a href="blubb"><b>Text</b>, and more</a>\n</p>'
+
+  >>> extractFirstPart(input2)
+  u'<p>text</p>'