From 191484c2e5037554a49aac605d345d8ce1f25534 Mon Sep 17 00:00:00 2001 From: Helmut Merz Date: Sun, 31 Mar 2013 11:04:55 +0200 Subject: [PATCH 1/2] remove empty line; fix doctest --- util/format.py | 1 - util/format.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/util/format.py b/util/format.py index 2a0f911..1299778 100644 --- a/util/format.py +++ b/util/format.py @@ -56,7 +56,6 @@ def formatNumber(num, type='decimal', lang='de', pattern=u'#,##0.00;-#,##0.00'): return fmt.format(num, pattern=pattern) - def toStr(value, encoding='UTF-8'): if isinstance(value, unicode): return value.encode(encoding) diff --git a/util/format.txt b/util/format.txt index 32c0f9b..b96450a 100644 --- a/util/format.txt +++ b/util/format.txt @@ -14,4 +14,4 @@ Basic Formatting Functions u'21.08.2006 17:37:13' >>> format.formatNumber(17.2) - u'17,2' + u'17,20' From 0bb012a9c6513b49d225cca5c9be1f2a830e2d84 Mon Sep 17 00:00:00 2001 From: Helmut Merz Date: Mon, 1 Apr 2013 10:38:17 +0200 Subject: [PATCH 2/2] add utility function for extracting first part of a text --- util/html.py | 12 ++++++++++++ util/html.txt | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/util/html.py b/util/html.py index 56abd5a..25cf8f2 100644 --- a/util/html.py +++ b/util/html.py @@ -36,6 +36,7 @@ validStyleParts = 'border padding'.split() escCommPattern = re.compile(r'<\!--\[if .*?\!\[endif\]-->', re.DOTALL) +sentencePattern = re.compile(r'[:.\?\!]') def sanitize(value, validTags=validTags, validAttrs=validAttrs, validStyles=validStyles, stripEscapedComments=True): @@ -104,3 +105,14 @@ def stripAll(value): text = u''.join(data).replace(u'\n', u'').replace(u' ', u' ') return text + +def extractFirstPart(value): + soup = BeautifulSoup(value) + for tag in soup.findAll(True): + if tag.name in ('p',): + part = tag.renderContents() + break + else: + text = stripAll(value) + part = sentencePattern.split(text)[0] + return ('

%s

' % part).decode('utf8') diff --git a/util/html.txt b/util/html.txt index b94cd9f..fd1980c 100644 --- a/util/html.txt +++ b/util/html.txt @@ -10,6 +10,9 @@ Tweaking HTML text ...

... """ +Sanitize HTML +------------- + >>> sanitize(input, validAttrs=['style']) u'\n

\nText, and more\n

\n' @@ -36,3 +39,14 @@ It is also possible to strip all HTML tags from the input string. >>> from cybertools.util.html import stripAll >>> stripAll(input) u'Text, and more' + +Extract first part of an HTML text +---------------------------------- + + >>> from cybertools.util.html import extractFirstPart + + >>> extractFirstPart(input) + u'

\nText, and more\n

' + + >>> extractFirstPart(input2) + u'

text

'