From 9989e73be9690cf0fccab901c9db81711cb9a9e7 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 3 Jan 2011 19:35:08 +0000 Subject: [PATCH] Moved parse_html.py and its submodules to a separate parse_html module. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@311 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/bkmk_rsimple_tos.py | 4 ++-- .../parse_html.py => parse_html/__init__.py | 24 +++++++++---------- .../beautifulsoup.py | 4 ++-- .../etreetidy.py | 4 ++-- .../html5.py | 4 ++-- .../htmlparser.py | 2 +- .../parse_html_lxml.py => parse_html/lxml.py | 4 ++-- .../parse_html_util.py => parse_html/util.py | 0 8 files changed, 23 insertions(+), 23 deletions(-) rename Robots/parse_html.py => parse_html/__init__.py (91%) mode change 100755 => 100644 rename Robots/parse_html_beautifulsoup.py => parse_html/beautifulsoup.py (97%) rename Robots/parse_html_etreetidy.py => parse_html/etreetidy.py (94%) rename Robots/parse_html_html5.py => parse_html/html5.py (96%) rename Robots/parse_html_htmlparser.py => parse_html/htmlparser.py (97%) rename Robots/parse_html_lxml.py => parse_html/lxml.py (93%) rename Robots/parse_html_util.py => parse_html/util.py (100%) diff --git a/Robots/bkmk_rsimple_tos.py b/Robots/bkmk_rsimple_tos.py index 0be22b8..ad42f09 100644 --- a/Robots/bkmk_rsimple_tos.py +++ b/Robots/bkmk_rsimple_tos.py @@ -1,13 +1,13 @@ """ Simple robot with socket's timeout - Written by Broytman. Copyright (C) 2000-2010 PhiloSoft Design + Written by Broytman. Copyright (C) 2000-2011 PhiloSoft Design """ import socket socket.setdefaulttimeout(900) -from bkmk_rsimple import robot_simple, get_error +from .bkmk_rsimple import robot_simple, get_error class robot_simple_tos(robot_simple): diff --git a/Robots/parse_html.py b/parse_html/__init__.py old mode 100755 new mode 100644 similarity index 91% rename from Robots/parse_html.py rename to parse_html/__init__.py index cbb45d6..c67f2e7 --- a/Robots/parse_html.py +++ b/parse_html/__init__.py @@ -1,4 +1,3 @@ -#! /usr/bin/env python """ HTML Parsers wrapper @@ -13,41 +12,41 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] try: - import parse_html_beautifulsoup - parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET + from . import beautifulsoup except ImportError: pass else: - parsers.append(parse_html_beautifulsoup.parse_html) + beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET + parsers.append(beautifulsoup.parse_html) try: - from parse_html_lxml import parse_html + from .lxml import parse_html except ImportError: pass else: parsers.append(parse_html) try: - from parse_html_htmlparser import parse_html + from .htmlparser import parse_html except ImportError: pass else: parsers.append(parse_html) try: - import parse_html_html5 + from . import html5 except ImportError: pass else: - parsers.append(parse_html_html5.parse_html) + parsers.append(html5.parse_html) # ElementTidy often segfaults #try: -# import parse_html_etreetidy +# from . import etreetidy #except ImportError: # pass #else: -# parsers.append(parse_html_etreetidy.parse_html) +# parsers.append(etreetidy.parse_html) import re from htmlentitydefs import name2codepoint @@ -100,9 +99,10 @@ def parse_html(filename, charset=None, log=None): for c in charsets: try: parser = p(filename, c, log) - break except UnicodeEncodeError: pass + else: + break if parser: break else: @@ -161,7 +161,7 @@ def parse_html(filename, charset=None, log=None): return parser -if __name__ == '__main__': +def test(): import sys l = len(sys.argv) diff --git a/Robots/parse_html_beautifulsoup.py b/parse_html/beautifulsoup.py similarity index 97% rename from Robots/parse_html_beautifulsoup.py rename to parse_html/beautifulsoup.py index e03dfce..86b8c1b 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/parse_html/beautifulsoup.py @@ -1,13 +1,13 @@ """ HTML Parser using BeautifulSoup - Written by Broytman. Copyright (C) 2007-2010 PhiloSoft Design + Written by Broytman. Copyright (C) 2007-2011 PhiloSoft Design """ import re from sgmllib import SGMLParser, SGMLParseError from BeautifulSoup import BeautifulSoup, CData -from parse_html_util import HTMLParser +from .util import HTMLParser # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63 diff --git a/Robots/parse_html_etreetidy.py b/parse_html/etreetidy.py similarity index 94% rename from Robots/parse_html_etreetidy.py rename to parse_html/etreetidy.py index 7149c22..d55aa57 100644 --- a/Robots/parse_html_etreetidy.py +++ b/parse_html/etreetidy.py @@ -1,11 +1,11 @@ """ HTML Parser using ElementTree+TidyLib. - Written by Broytman. Copyright (C) 2010 PhiloSoft Design + Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design """ from elementtidy import TidyHTMLTreeBuilder -from parse_html_util import HTMLParser +from .util import HTMLParser def parse_html(filename, charset=None, log=None): diff --git a/Robots/parse_html_html5.py b/parse_html/html5.py similarity index 96% rename from Robots/parse_html_html5.py rename to parse_html/html5.py index 43e8d74..52bd576 100644 --- a/Robots/parse_html_html5.py +++ b/parse_html/html5.py @@ -1,11 +1,11 @@ """ HTML Parser using html5. - Written by Broytman. Copyright (C) 2010 PhiloSoft Design + Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design """ from html5lib import HTMLParser as HTML5Parser -from parse_html_util import HTMLParser +from .util import HTMLParser def parse_html(filename, charset=None, log=None): diff --git a/Robots/parse_html_htmlparser.py b/parse_html/htmlparser.py similarity index 97% rename from Robots/parse_html_htmlparser.py rename to parse_html/htmlparser.py index 7702162..5f885b2 100644 --- a/Robots/parse_html_htmlparser.py +++ b/parse_html/htmlparser.py @@ -1,7 +1,7 @@ """ HTML Parser - Written by Broytman. Copyright (C) 1997-2010 PhiloSoft Design + Written by Broytman. Copyright (C) 1997-2011 PhiloSoft Design """ from HTMLParser import HTMLParseError diff --git a/Robots/parse_html_lxml.py b/parse_html/lxml.py similarity index 93% rename from Robots/parse_html_lxml.py rename to parse_html/lxml.py index 8f658e1..f399397 100644 --- a/Robots/parse_html_lxml.py +++ b/parse_html/lxml.py @@ -1,11 +1,11 @@ """ HTML Parser using lxml.html. - Written by Broytman. Copyright (C) 2010 PhiloSoft Design + Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design """ from lxml.html import parse -from parse_html_util import HTMLParser +from .util import HTMLParser def parse_html(filename, charset=None, log=None): diff --git a/Robots/parse_html_util.py b/parse_html/util.py similarity index 100% rename from Robots/parse_html_util.py rename to parse_html/util.py -- 2.39.5