X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2Fbkmk_parse_html.py;h=07fe32e3e7e6a658ec82f0123acff746a7ac0b01;hb=9faa13f6f8199790cf01533e857c593520559649;hp=af9395b0d8d735689288a02071a0e1aee80814de;hpb=7fc9a9ac1bfa749aa30e3ae1d730ac4f266db950;p=bookmarks_db.git diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index af9395b..07fe32e 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html', 'parse_filename', 'universal_charset'] @@ -14,7 +14,7 @@ __all__ = ['parse_html', 'parse_filename', 'universal_charset'] import codecs universal_charset = "utf-8" -DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic +DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] @@ -56,12 +56,12 @@ else: parsers.append(bkmk_ph_htmlparser.parse_html) # ElementTidy often segfaults -#try: -# from . import bkmk_ph_etreetidy -#except ImportError: -# pass -#else: -# parsers.append(bkmk_ph_etreetidy.parse_html) +# try: +# from . import bkmk_ph_etreetidy +# except ImportError: +# pass +# else: +# parsers.append(bkmk_ph_etreetidy.parse_html) import re from htmlentitydefs import name2codepoint @@ -69,6 +69,7 @@ from htmlentitydefs import name2codepoint entity_re = re.compile("(&\w+;)") num_entity_re = re.compile("(&#[0-9]+;)") + def recode_entities(title, charset): output = [] for part in entity_re.split(title): @@ -86,7 +87,7 @@ def recode_entities(title, charset): try: part = unichr(int(part[2:-1])).encode(charset) except UnicodeEncodeError: - pass # Leave the entity as is + pass # Leave the entity as is output.append(part) return ''.join(output) @@ -95,13 +96,14 @@ def recode_entities(title, charset): import os BKMK_DEBUG_HTML_PARSERS = os.environ.get("BKMK_DEBUG_HTML_PARSERS") + def parse_html(html_text, charset=None, log=None): if not parsers: return None if charset: try: - codecs.lookup(charset) # In case of unknown charset... + codecs.lookup(charset) # In case of unknown charset... except (ValueError, LookupError): charset = None # ...try charset from HTML @@ -180,7 +182,7 @@ def parse_html(html_text, charset=None, log=None): if log: log(" META charset : %s" % parser.charset) elif (not charset) or (charset != parser.charset): if log: log(" guessed charset: %s" % parser.charset) - #if log: log(" current charset: %s" % universal_charset) + # if log: log(" current charset: %s" % universal_charset) if log: log(" title : %s" % title) if parser.charset != universal_charset: try: @@ -211,6 +213,7 @@ def parse_html(html_text, charset=None, log=None): parser.icon = icon.encode(parser.charset) return parser + def parse_filename(filename, charset=None, log=None): fp = open(filename, 'r') try: