diff --git a/util/html.py b/util/html.py index 56abd5a..25cf8f2 100644 --- a/util/html.py +++ b/util/html.py @@ -36,6 +36,7 @@ validStyleParts = 'border padding'.split() escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL) +sentencePattern = re.compile(r'[:.\?\!]') def sanitize(value, validTags=validTags, validAttrs=validAttrs, validStyles=validStyles, stripEscapedComments=True): @@ -104,3 +105,14 @@ def stripAll(value): text = u''.join(data).replace(u'\n', u'').replace(u' ', u' ') return text + +def extractFirstPart(value): + soup = BeautifulSoup(value) + for tag in soup.findAll(True): + if tag.name in ('p',): + part = tag.renderContents() + break + else: + text = stripAll(value) + part = sentencePattern.split(text)[0] + return ('
%s
' % part).decode('utf8') diff --git a/util/html.txt b/util/html.txt index b94cd9f..fd1980c 100644 --- a/util/html.txt +++ b/util/html.txt @@ -10,6 +10,9 @@ Tweaking HTML text ... ...