cybertools/text/html.py
helmutm 7ec9bbdc15 merged Dojo 1.0 branch
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2387 fd906abe-77d9-0310-91a1-e0d9ade77398
2008-02-10 09:56:27 +00:00

44 lines
1.3 KiB
Python

#
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Searchable text support for HTML files.
$Id$
"""
import os, sys
from cStringIO import StringIO
from cybertools.text import base
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString
def htmlToText(html):
    """ Return the text contained in the HTML markup ``html`` as a single
        unicode string.

        The markup is parsed with BeautifulSoup, the text nodes are gathered
        via ``collectText()``, joined with a blank, and finally newlines and
        the ``u' '`` sequence are removed from the result.

        If the markup has no ``<html>`` element (e.g. an HTML fragment or an
        empty string) the whole parsed document is scanned instead of
        failing with an AttributeError.
    """
    data = []
    document = BeautifulSoup(html)
    root = document.html
    if root is None:
        # No <html> element found: fall back to the parsed document itself
        # so that fragments still yield their text.
        root = document
    collectText([root], data)
    # NOTE(review): the second replace() removes u' ' - if that literal is a
    # plain blank it also strips the separators just inserted by join();
    # confirm whether a non-breaking space / entity was intended here.
    text = u' '.join(data).replace(u'\n', u'').replace(u' ', u'')
    return text
def collectText(tags, data):
    """ Recursively append the plain text nodes found in ``tags`` to the
        list ``data`` (modified in place; nothing is returned).

        ``tags`` is an iterable of parse tree nodes. A node whose type is
        exactly ``NavigableString`` is appended as is; for any other node
        its ``contents`` (if present) are scanned recursively. Nodes that
        are neither plain text nor provide ``contents`` - e.g. ``None`` -
        are skipped instead of raising an AttributeError.
    """
    for node in tags:
        # Exact type check kept on purpose: subclasses of NavigableString
        # are not collected (presumably comments/declarations - TODO confirm).
        if type(node) is NavigableString:
            data.append(node)
            continue
        # Only descend into nodes that really have children.
        children = getattr(node, 'contents', None)
        if children:
            collectText(children, data)