X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;ds=sidebyside;f=parse_html%2Fbkmk_parse_html.py;h=a6b3d92a1e98137850b4c241ef89b713d23b6ab5;hb=6eae6e394f44a55e5be10aa59408bdf76d0af9e0;hp=f42dab8d4de79b040b4638b5d8d7806aa519b5d2;hpb=338c964afba3651bd8fe6318644c0fcabb66cc3b;p=bookmarks_db.git diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index f42dab8..a6b3d92 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -20,6 +20,12 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] +# Statistics by parser - successfully parsed HTML pages: +# 4358 beautifulsoup +# 4310 htmlparser +# 4307 html5 +# 4250 lxml + try: from . import bkmk_ph_beautifulsoup except ImportError: @@ -29,25 +35,25 @@ else: parsers.append(bkmk_ph_beautifulsoup.parse_html) try: - from . import bkmk_ph_lxml + from . import bkmk_ph_htmlparser except ImportError: pass else: - parsers.append(bkmk_ph_lxml.parse_html) + parsers.append(bkmk_ph_htmlparser.parse_html) try: - from . import bkmk_ph_htmlparser + from . import bkmk_ph_html5 except ImportError: pass else: - parsers.append(bkmk_ph_htmlparser.parse_html) + parsers.append(bkmk_ph_html5.parse_html) try: - from . import bkmk_ph_html5 + from . import bkmk_ph_lxml except ImportError: pass else: - parsers.append(bkmk_ph_html5.parse_html) + parsers.append(bkmk_ph_lxml.parse_html) # ElementTidy often segfaults #try: @@ -103,24 +109,30 @@ def parse_html(filename, charset=None, log=None): charsets.remove(charset) charsets.insert(0, charset) + #_parsers = [] for p in parsers: parser = None for c in charsets: try: parser = p(filename, c, log) - except UnicodeEncodeError: + except UnicodeError: pass else: - break + if parser: + #if log: log(" Parser %s: ok" % p.__module__) + #_parsers.append(parser) + break + else: + if log: log(" Parser %s: fail" % p.__module__) if parser: break - else: - if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__)) + #if not _parsers: if not parser: - if log: log("All parser has failed.") + if log: log(" All parsers have failed") return None + #parser = _parsers[0] if log: log(" Using %s" % p.__module__) converted_title = title = parser.title