X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2Fbkmk_parse_html.py;h=42cb5ce1575abf2f0e1eb073943ca62240bcccb2;hb=d5b372c5bf9f87844de555c84cc09dad2dac180f;hp=f42dab8d4de79b040b4638b5d8d7806aa519b5d2;hpb=9c0c9f60aa753e09cf3a091deda6b939ebb956b6;p=bookmarks_db.git diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index f42dab8..42cb5ce 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -20,6 +20,12 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] +# Statistics by parser - successfully parsed HTML pages: +# 4358 beautifulsoup +# 4310 htmlparser +# 4307 html5 +# 4250 lxml + try: from . import bkmk_ph_beautifulsoup except ImportError: @@ -29,25 +35,25 @@ else: parsers.append(bkmk_ph_beautifulsoup.parse_html) try: - from . import bkmk_ph_lxml + from . import bkmk_ph_htmlparser except ImportError: pass else: - parsers.append(bkmk_ph_lxml.parse_html) + parsers.append(bkmk_ph_htmlparser.parse_html) try: - from . import bkmk_ph_htmlparser + from . import bkmk_ph_html5 except ImportError: pass else: - parsers.append(bkmk_ph_htmlparser.parse_html) + parsers.append(bkmk_ph_html5.parse_html) try: - from . import bkmk_ph_html5 + from . import bkmk_ph_lxml except ImportError: pass else: - parsers.append(bkmk_ph_html5.parse_html) + parsers.append(bkmk_ph_lxml.parse_html) # ElementTidy often segfaults #try: @@ -103,25 +109,31 @@ def parse_html(filename, charset=None, log=None): charsets.remove(charset) charsets.insert(0, charset) + #_parsers = [] for p in parsers: parser = None for c in charsets: try: parser = p(filename, c, log) - except UnicodeEncodeError: + except UnicodeError: pass else: - break + if parser: + if log: log(" Parser %s: ok" % p.__module__) + #_parsers.append(parser) + break + else: + if log: log(" Parser %s: fail" % p.__module__) if parser: break - else: - if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__)) + #if not _parsers: if not parser: - if log: log("All parser has failed.") + if log: log(" All parser has failed") return None - if log: log(" Using %s" % p.__module__) + #parser = _parsers[0] + if log: log(" Using %s" % parser.__module__) converted_title = title = parser.title if title and (not parser.charset):