X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fparse_html.py;h=c05e66cce3a2ad394807b057e4071625f5390fcb;hb=32335282d920880ef89f7abe14855cbb27092138;hp=c7acb6426e7dfbf386702ffb18fe5f12b3bed1ce;hpb=240c6b89a3825d71e040702dc9f9f948a71da107;p=bookmarks_db.git diff --git a/Robots/parse_html.py b/Robots/parse_html.py index c7acb64..c05e66c 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -56,7 +56,16 @@ def parse_html(filename, charset=None, log=None): charset = None # ...try charset from HTML for p in parsers: - parser = p(filename, charset) + charsets = [universal_charset, DEFAULT_CHARSET] + if charset not in charsets: + charsets.insert(0, charset) + parser = None + for c in charsets: + try: + parser = p(filename, c) + break + except UnicodeEncodeError: + pass if parser: break else: @@ -77,7 +86,7 @@ def parse_html(filename, charset=None, log=None): if parser.meta_charset: if log: log(" META charset : %s" % parser.charset) else: - if log: log(" HTTP charset : %s" % parser.charset) + if log: log(" guessed charset: %s" % parser.charset) if log: log(" current charset: %s" % universal_charset) if log: log(" title : %s" % title) try: