From d5b372c5bf9f87844de555c84cc09dad2dac180f Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Thu, 6 Jan 2011 19:00:27 +0000 Subject: [PATCH] Added code to collect statistics on parsers; sorted parsers according to the statistics; commented out the statistics code. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@322 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- parse_html/bkmk_parse_html.py | 36 +++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index f42dab8..42cb5ce 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -20,6 +20,12 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] +# Statistics by parser - successfully parsed HTML pages: +# 4358 beautifulsoup +# 4310 htmlparser +# 4307 html5 +# 4250 lxml + try: from . import bkmk_ph_beautifulsoup except ImportError: @@ -29,25 +35,25 @@ else: parsers.append(bkmk_ph_beautifulsoup.parse_html) try: - from . import bkmk_ph_lxml + from . import bkmk_ph_htmlparser except ImportError: pass else: - parsers.append(bkmk_ph_lxml.parse_html) + parsers.append(bkmk_ph_htmlparser.parse_html) try: - from . import bkmk_ph_htmlparser + from . import bkmk_ph_html5 except ImportError: pass else: - parsers.append(bkmk_ph_htmlparser.parse_html) + parsers.append(bkmk_ph_html5.parse_html) try: - from . import bkmk_ph_html5 + from . 
import bkmk_ph_lxml except ImportError: pass else: - parsers.append(bkmk_ph_html5.parse_html) + parsers.append(bkmk_ph_lxml.parse_html) # ElementTidy often segfaults #try: @@ -103,25 +109,31 @@ def parse_html(filename, charset=None, log=None): charsets.remove(charset) charsets.insert(0, charset) + #_parsers = [] for p in parsers: parser = None for c in charsets: try: parser = p(filename, c, log) - except UnicodeEncodeError: + except UnicodeError: pass else: - break + if parser: + if log: log(" Parser %s: ok" % p.__module__) + #_parsers.append(parser) + break + else: + if log: log(" Parser %s: fail" % p.__module__) if parser: break - else: - if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__)) + #if not _parsers: if not parser: - if log: log("All parser has failed.") + if log: log(" All parser has failed") return None - if log: log(" Using %s" % p.__module__) + #parser = _parsers[0] + if log: log(" Using %s" % parser.__module__) converted_title = title = parser.title if title and (not parser.charset): -- 2.39.2