Added basic support for converting HTML files to text using BeautifulSoup
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@1610 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
898e6b5a26
commit
041c305d29
5 changed files with 2020 additions and 0 deletions
|
@ -17,6 +17,17 @@ The test files are in a subdirectory of the text package:
|
||||||
>>> from cybertools import text
|
>>> from cybertools import text
|
||||||
>>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
|
>>> testdir = os.path.join(os.path.dirname(text.__file__), 'testfiles')
|
||||||
|
|
||||||
|
HTML
|
||||||
|
----
|
||||||
|
|
||||||
|
>>> from cybertools.text.html import htmlToText
|
||||||
|
>>> html = open(os.path.join(testdir, 'selfhtml.html')).read()
|
||||||
|
>>> text = htmlToText(html)
|
||||||
|
>>> '<p>' in html
|
||||||
|
True
|
||||||
|
>>> '<p>' in text
|
||||||
|
False
|
||||||
|
|
||||||
PDF Files
|
PDF Files
|
||||||
---------
|
---------
|
||||||
|
|
||||||
|
|
44
text/html.py
Normal file
44
text/html.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Searchable text support for HTML files.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys
|
||||||
|
from cStringIO import StringIO
|
||||||
|
|
||||||
|
from cybertools.text import base
|
||||||
|
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, NavigableString
|
||||||
|
|
||||||
|
|
||||||
|
def htmlToText(html):
    """Return the plain text contained in an HTML document.

    The markup is parsed with BeautifulSoup, the text nodes are gathered
    by ``collectText()`` and joined into a single string in which every
    run of whitespace is collapsed to one space, so that words stay
    separated and the result can be used as searchable text.

    ``html`` is the HTML source as a string.  Returns the extracted
    text; an input without any text yields an empty string.
    """
    soup = BeautifulSoup(html)
    # Input without an <html> element (fragments, empty strings) gives
    # None here; fall back to the whole parse tree instead of failing
    # with an AttributeError during the traversal.
    root = soup.html
    if root is None:
        root = soup
    data = []
    collectText([root], data)
    # NOTE(review): the second replace() target of the previous version
    # looks like it was a non-breaking space, possibly an unconverted
    # ``&nbsp;`` entity reference - confirm.  Both forms are treated as
    # ordinary whitespace here.
    text = u' '.join(data).replace(u'&nbsp;', u' ')
    # split() without arguments splits on any whitespace run (newlines,
    # tabs, non-breaking spaces) and drops leading/trailing whitespace.
    return u' '.join(text.split())
|
||||||
|
|
||||||
|
def collectText(tags, data):
    """Append the text nodes found below ``tags`` to ``data``.

    ``tags`` is a sequence of parse tree nodes (tags and strings) and
    ``data`` the list that receives the text nodes in document order;
    it is modified in place.  Nothing is returned.
    """
    for tag in tags:
        if isinstance(tag, NavigableString):
            # Only plain text nodes are collected.  Instances of
            # NavigableString subclasses (presumably comments,
            # declarations and the like - verify against the bundled
            # BeautifulSoup) are string nodes as well: they have no
            # ``contents`` to descend into and are skipped.
            if type(tag) is NavigableString:
                data.append(tag)
        else:
            collectText(tag.contents, data)
|
1812
text/lib/BeautifulSoup.py
Normal file
1812
text/lib/BeautifulSoup.py
Normal file
File diff suppressed because it is too large
Load diff
3
text/lib/__init__.py
Normal file
3
text/lib/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
"""
|
||||||
|
$Id$
|
||||||
|
"""
|
150
text/testfiles/selfhtml.html
Normal file
150
text/testfiles/selfhtml.html
Normal file
|
@ -0,0 +1,150 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||||
|
<html><head>
|
||||||
|
|
||||||
|
|
||||||
|
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"><title>SELFHTML 8.1.2 (HTML-Dateien selbst erstellen)</title>
|
||||||
|
|
||||||
|
<meta name="description" content="SELFHTML 8.1.2 - Die bekannte Dokumentation zu HTML, JavaScript und CGI/Perl - Tutorial und Referenz, mit etlichen Zusatztips zu Design, Grafik, Projektverwaltung usw.">
|
||||||
|
<meta name="keywords" content="SELFHTML, HTML, Dynamic HTML, JavaScript, CGI, Perl, Grafik, WWW-Seiten, Web-Seiten, Hilfe, Dokumentation, Beschreibung">
|
||||||
|
<meta name="author" content="Redaktion SELFHTML, selfhtml81@selfhtml.org">
|
||||||
|
<meta name="DC.Publisher" content="SELFHTML e. V.">
|
||||||
|
<meta name="DC.Date" content="2005-11-11T12:48:29+01:00">
|
||||||
|
<meta name="DC.Identifier" content="http://de.selfhtml.org/">
|
||||||
|
<meta name="DC.Language" content="de">
|
||||||
|
<meta name="DC.Rights" content="editorial/copyright.htm">
|
||||||
|
<meta name="DC.Date.created" content="2001-10-27T08:00+01:00">
|
||||||
|
<meta name="SELF.Pagetype" content="chapter">
|
||||||
|
<link rel="stylesheet" type="text/css" href="selfhtml-Dateien/selfhtml.css">
|
||||||
|
<link rel="alternate" type="application/atom+xml" title="SELFHTML-Weblog (Atom, gesamt)" href="http://aktuell.de.selfhtml.org/weblog/atom-feed">
|
||||||
|
<link rel="alternate" type="application/rss+xml" title="SELFHTML-Weblog (RSS, Auszüge)" href="http://aktuell.de.selfhtml.org/weblog/rss-feed">
|
||||||
|
<link rel="shortcut icon" type="image/x-icon" href="http://de.selfhtml.org/src/favicon.ico">
|
||||||
|
<link rel="author" title="Impressum" href="http://de.selfhtml.org/editorial/impressum.htm">
|
||||||
|
<link rel="contents" title="Inhaltsverzeichnis" href="http://de.selfhtml.org/navigation/inhalt.htm">
|
||||||
|
<link rel="index" title="Stichwortverzeichnis" href="http://de.selfhtml.org/navigation/stichwort.htm">
|
||||||
|
<link rel="search" title="Suche" href="http://de.selfhtml.org/navigation/suche/index.htm">
|
||||||
|
<link rel="help" title="Hilfe" href="http://de.selfhtml.org/editorial/index.htm">
|
||||||
|
<link rel="copyright" title="Urheberrecht" href="http://de.selfhtml.org/editorial/copyright.htm"></head><body>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="4" cellspacing="0" width="100%">
|
||||||
|
<tbody><tr>
|
||||||
|
<td colspan="2" class="nav"><a class="an" name="top"><strong>SELFHTML: Version 8.1.2 vom 01.03.2007</strong></a></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="doc"><img src="selfhtml-Dateien/logo.gif" alt="SELFHTML" border="0" height="109" width="106"></td>
|
||||||
|
<td class="docbot">
|
||||||
|
<span class="blue">Die Energie des Verstehens</span><br>
|
||||||
|
<b>HTML-Dateien selbst erstellen</b>
|
||||||
|
<h1>SELFHTML</h1>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="doctop">
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="5" width="100%"><tbody><tr>
|
||||||
|
<td class="nav" align="center"><a href="http://aktuell.de.selfhtml.org/weblog/" target="_blank"><img src="selfhtml-Dateien/auge.gif" alt="News" border="0" height="16" width="24"><br>News</a><br>
|
||||||
|
Online-News<br>
|
||||||
|
rund um<br>
|
||||||
|
SELFHTML<br> <br>
|
||||||
|
<b>Suche nach:</b>
|
||||||
|
<form action="navigation/suche/index.htm" method="get" style="margin: 0pt;">
|
||||||
|
<input name="Suchanfrage" size="10" accesskey="u" style="width: 90px;" type="text">
|
||||||
|
</form>
|
||||||
|
</td></tr></tbody></table>
|
||||||
|
</td>
|
||||||
|
<td valign="top" width="100%">
|
||||||
|
<table cellpadding="5" cellspacing="5"><tbody><tr><td nowrap="nowrap" valign="top">
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Inhalt: Allgemeines</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/editorial/index.htm">Editorial</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/intro/index.htm">Einführung</a></p>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Inhalt: Web-Technologien</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/html/index.htm">HTML/XHTML</a> <br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/css/index.htm">Stylesheets (CSS)</a> <br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/xml/index.htm">XML/DTDs</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/javascript/index.htm">JavaScript/DOM</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/dhtml/index.htm">Dynamisches HTML</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/perl/index.htm">Perl</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/php/index.htm">PHP</a><br>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Inhalt: Ergänzendes Wissen</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/inter/index.htm">Internationalisierung</a> <br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/grafik/index.htm">Grafik</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/projekt/index.htm">Web-Projektverwaltung</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/servercgi/index.htm">Webserver/CGI</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/diverses/index.htm">Diverse technische Ergänzungen</a></p>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Inhalt: Extras</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/layouts/index.htm">Fertige Layouts</a><br>
|
||||||
|
<img src="selfhtml-Dateien/kap.gif" alt="Kapitel" height="13" width="15"> <a href="http://de.selfhtml.org/helferlein/index.htm">Kleine Helferlein</a><br> </p>
|
||||||
|
</td>
|
||||||
|
<td nowrap="nowrap" valign="top">
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Navigation: Einstieg</b></td></tr></tbody></table>
|
||||||
|
<p><img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/anfang/index.htm">Wie fange ich an?</a><br>
|
||||||
|
<img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/faq.htm">Häufig gestellte Fragen (FAQ)</a></p>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Navigation: Kurzreferenzen</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/html.htm">Kurzreferenz: HTML</a><br>
|
||||||
|
<img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/css.htm">Kurzreferenz: CSS</a></p>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Navigation: Verzeichnisse</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/inhalt.htm">Inhaltsverzeichnis</a><br>
|
||||||
|
<img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/syntax.htm">Syntaxverzeichnis</a><br>
|
||||||
|
<img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/stichwort.htm">Stichwortverzeichnis</a></p>
|
||||||
|
|
||||||
|
<table border="0" cellpadding="5" cellspacing="0" width="250"><tbody><tr>
|
||||||
|
<td class="doc"><b>Navigation: Extras</b></td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/quickbar/index.htm" target="_parent">Quickbar</a><br>
|
||||||
|
<img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/sidebars/index.htm" target="_parent">Sidebars</a><br>
|
||||||
|
<img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/navigation/suche/index.htm" target="_parent">Suche</a></p>
|
||||||
|
</td></tr></tbody></table>
|
||||||
|
</td></tr>
|
||||||
|
<tr>
|
||||||
|
<td class="doctop" rowspan="2" align="center" width="110"> </td>
|
||||||
|
<td class="doctop" width="100%"><h2>SELFHTML aktuell</h2></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td valign="top" width="100%">
|
||||||
|
|
||||||
|
<table cellpadding="5" cellspacing="5"><tbody><tr><td nowrap="nowrap" valign="top">
|
||||||
|
|
||||||
|
<p>Das <b>Online-Angebot von SELFHTML</b>, das Sie ebenfalls kennen sollten!<br>Die Einstiegsseite erreichen Sie über den folgenden Link:</p>
|
||||||
|
|
||||||
|
<p><img src="selfhtml-Dateien/serverkap.gif" alt="Online-Kapitel" height="13" width="15"> <a target="_top" href="http://aktuell.de.selfhtml.org/">SELFHTML aktuell</a></p>
|
||||||
|
|
||||||
|
<p>Dort finden Sie unter anderem:</p>
|
||||||
|
<p>
|
||||||
|
<img src="selfhtml-Dateien/serverdok.gif" alt="Online-Seite" height="10" width="15"> <a target="_top" href="http://aktuell.de.selfhtml.org/extras/download.shtml">Download der Dokumentation</a> zum Offline-Lesen<br>
|
||||||
|
<img src="selfhtml-Dateien/serverdok.gif" alt="Online-Seite" height="10" width="15"> <a target="_top" href="http://aktuell.de.selfhtml.org/extras/buch.htm">SELFHTML als Buch</a> für Bildschirmmuffel<br>
|
||||||
|
<img src="selfhtml-Dateien/serverkap.gif" alt="Online-Kapitel" height="13" width="15"> <a target="_top" href="http://aktuell.de.selfhtml.org/links/">Linkverzeichnis</a> mit Links zu anderen Informationsquellen<br>
|
||||||
|
<img src="selfhtml-Dateien/serverkap.gif" alt="Online-Kapitel" height="13" width="15"> <a target="_top" href="http://aktuell.de.selfhtml.org/artikel/">Fachartikel</a> diverser Autoren zu einzelnen Themen<br>
|
||||||
|
<img src="selfhtml-Dateien/serverdok.gif" alt="Online-Seite" height="10" width="15"> <a target="_top" href="http://forum.de.selfhtml.org/">Forum</a> zum Diskutieren von Fachthemen und Anderem
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p> </p></td></tr></tbody></table></td></tr>
|
||||||
|
<tr><td colspan="2" class="doc">
|
||||||
|
<a href="#top"><img src="selfhtml-Dateien/up.gif" alt="nach oben" border="0" height="10" width="14"></a>
|
||||||
|
</td></tr></tbody></table>
|
||||||
|
|
||||||
|
<p>Die Originaladresse dieses Dokuments im Web lautet:<br>
|
||||||
|
<img src="selfhtml-Dateien/serverdok.gif" alt="Online-Seite" height="10" width="15"> <a target="_top" href="http://de.selfhtml.org/">http://de.selfhtml.org/</a></p>
|
||||||
|
|
||||||
|
<p>© 2007 <img src="selfhtml-Dateien/dok.gif" alt="Seite" height="10" width="15"> <a href="http://de.selfhtml.org/editorial/impressum.htm">Impressum</a></p>
|
||||||
|
|
||||||
|
</body></html>
|
Loading…
Add table
Reference in a new issue