X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=parse_html%2Fbkmk_parse_html.py;h=af9395b0d8d735689288a02071a0e1aee80814de;hp=7bc4640d31ccad2a902335c8a2aad978b02bb09c;hb=7fc9a9ac1bfa749aa30e3ae1d730ac4f266db950;hpb=c88cb7a75e7caf1d67466cfa107981d95115fa0c diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 7bc4640..af9395b 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -18,6 +18,14 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] +try: + from . import bkmk_ph_beautifulsoup4 +except ImportError: + pass +else: + bkmk_ph_beautifulsoup4.DEFAULT_CHARSET = DEFAULT_CHARSET + parsers.append(bkmk_ph_beautifulsoup4.parse_html) + try: from . import bkmk_ph_beautifulsoup except ImportError: @@ -136,6 +144,24 @@ def parse_html(html_text, charset=None, log=None): p, parser = _parsers[0] if log: log(" Using %s" % p.__module__) + title = parser.title + if isinstance(title, unicode): + if parser.charset: + parser.title = title.encode(parser.charset) + else: + try: + parser.title = title.encode('ascii') + except UnicodeEncodeError: + try: + parser.title = title.encode(DEFAULT_CHARSET) + except UnicodeEncodeError: + parser.title = title.encode(universal_charset) + parser.charset = universal_charset + else: + parser.charset = DEFAULT_CHARSET + else: + parser.charset = 'ascii' + converted_title = title = parser.title if title and (not parser.charset): try: