X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=parse_html%2Fbkmk_parse_html.py;h=7bc4640d31ccad2a902335c8a2aad978b02bb09c;hp=2e412ad84dc69756662c215e287e120dad66f72a;hb=c88cb7a75e7caf1d67466cfa107981d95115fa0c;hpb=a04eaa0346e8aa5ad86a195f8f4d36487ebfe09c diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 2e412ad..7bc4640 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -19,31 +19,31 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] try: - from . import bkmk_ph_beautifulsoup + from . import bkmk_ph_beautifulsoup except ImportError: - pass + pass else: - bkmk_ph_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET - parsers.append(bkmk_ph_beautifulsoup.parse_html) + bkmk_ph_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET + parsers.append(bkmk_ph_beautifulsoup.parse_html) try: - from . import bkmk_ph_html5 + from . import bkmk_ph_html5 except ImportError: - pass + pass else: - parsers.append(bkmk_ph_html5.parse_html) + parsers.append(bkmk_ph_html5.parse_html) try: - from . import bkmk_ph_lxml + from . import bkmk_ph_lxml except ImportError: - pass + pass else: parsers.append(bkmk_ph_lxml.parse_html) try: - from . import bkmk_ph_htmlparser + from . import bkmk_ph_htmlparser except ImportError: - pass + pass else: parsers.append(bkmk_ph_htmlparser.parse_html) @@ -62,128 +62,128 @@ entity_re = re.compile("(&\w+;)") num_entity_re = re.compile("(&#[0-9]+;)") def recode_entities(title, charset): - output = [] - for part in entity_re.split(title): - if part not in ("&", "<", ">", """) and \ - entity_re.match(part): - _part = name2codepoint.get(part[1:-1], None) - if _part is not None: - part = unichr(_part).encode(charset) - output.append(part) - title = ''.join(output) - - output = [] - for part in num_entity_re.split(title): - if num_entity_re.match(part): - try: - part = unichr(int(part[2:-1])).encode(charset) - except UnicodeEncodeError: - pass # Leave the entity as is - output.append(part) - - return ''.join(output) + output = [] + for part in entity_re.split(title): + if part not in ("&", "<", ">", """) and \ + entity_re.match(part): + _part = name2codepoint.get(part[1:-1], None) + if _part is not None: + part = unichr(_part).encode(charset) + output.append(part) + title = ''.join(output) + + output = [] + for part in num_entity_re.split(title): + if num_entity_re.match(part): + try: + part = unichr(int(part[2:-1])).encode(charset) + except UnicodeEncodeError: + pass # Leave the entity as is + output.append(part) + + return ''.join(output) import os BKMK_DEBUG_HTML_PARSERS = os.environ.get("BKMK_DEBUG_HTML_PARSERS") def parse_html(html_text, charset=None, log=None): - if not parsers: - return None - - if charset: - try: - codecs.lookup(charset) # In case of unknown charset... - except (ValueError, LookupError): - charset = None # ...try charset from HTML - - charsets = [universal_charset, DEFAULT_CHARSET] - if charset: - charset = charset.lower().replace("windows-", "cp") - if charset in charsets: - charsets.remove(charset) - charsets.insert(0, charset) - - if BKMK_DEBUG_HTML_PARSERS: - _parsers = [] - for p in parsers: - parser = None - for c in charsets: - try: - parser = p(html_text, c, log) - except UnicodeError: - pass - else: - if parser: - if BKMK_DEBUG_HTML_PARSERS: - if log: log(" Parser %s: ok" % p.__module__) - _parsers.append((p, parser)) - break - else: - if log: log(" Parser %s: fail" % p.__module__) - if not BKMK_DEBUG_HTML_PARSERS and parser: - break - - if BKMK_DEBUG_HTML_PARSERS: - if not _parsers: - if log: log(" All parsers have failed") - return None - elif not parser: - if log: log(" All parsers have failed") - return None - - if BKMK_DEBUG_HTML_PARSERS: - p, parser = _parsers[0] - if log: log(" Using %s" % p.__module__) - - converted_title = title = parser.title - if title and (not parser.charset): - try: - unicode(title, "ascii") - except UnicodeDecodeError: - parser.charset = DEFAULT_CHARSET - - if parser.charset: - parser.charset = parser.charset.lower().replace("windows-", "cp") - - if title and parser.charset and ( - (parser.charset != universal_charset) or - ((not charset) or (charset != parser.charset))): - try: - if parser.meta_charset: - if log: log(" META charset : %s" % parser.charset) - elif (not charset) or (charset != parser.charset): - if log: log(" guessed charset: %s" % parser.charset) - #if log: log(" current charset: %s" % universal_charset) - if log: log(" title : %s" % title) - if parser.charset != universal_charset: + if not parsers: + return None + + if charset: + try: + codecs.lookup(charset) # In case of unknown charset... + except (ValueError, LookupError): + charset = None # ...try charset from HTML + + charsets = [universal_charset, DEFAULT_CHARSET] + if charset: + charset = charset.lower().replace("windows-", "cp") + if charset in charsets: + charsets.remove(charset) + charsets.insert(0, charset) + + if BKMK_DEBUG_HTML_PARSERS: + _parsers = [] + for p in parsers: + parser = None + for c in charsets: try: - converted_title = unicode(title, parser.charset).encode(universal_charset) + parser = p(html_text, c, log) except UnicodeError: - if log: log(" incorrect conversion from %s, converting from %s" % (parser.charset, DEFAULT_CHARSET)) - converted_title = unicode(title, DEFAULT_CHARSET, "replace").encode(universal_charset, "replace") - parser.charset = DEFAULT_CHARSET - if log and (converted_title != title): log(" converted title: %s" % converted_title) - except LookupError: - if log: log(" unknown charset: '%s'" % parser.charset) - else: - if log: log(" title : %s" % title) - - if title: - final_title = recode_entities(converted_title, universal_charset) - parts = [s.strip() for s in final_title.replace('\r', '').split('\n')] - final_title = ' '.join([s for s in parts if s]) - if log and (final_title != converted_title): log(" final title : %s" % final_title) - parser.title = final_title - - icon = parser.icon - if isinstance(icon, unicode): - try: - parser.icon = icon.encode('ascii') - except UnicodeEncodeError: - if parser.charset: - parser.icon = icon.encode(parser.charset) - return parser + pass + else: + if parser: + if BKMK_DEBUG_HTML_PARSERS: + if log: log(" Parser %s: ok" % p.__module__) + _parsers.append((p, parser)) + break + else: + if log: log(" Parser %s: fail" % p.__module__) + if not BKMK_DEBUG_HTML_PARSERS and parser: + break + + if BKMK_DEBUG_HTML_PARSERS: + if not _parsers: + if log: log(" All parsers have failed") + return None + elif not parser: + if log: log(" All parsers have failed") + return None + + if BKMK_DEBUG_HTML_PARSERS: + p, parser = _parsers[0] + if log: log(" Using %s" % p.__module__) + + converted_title = title = parser.title + if title and (not parser.charset): + try: + unicode(title, "ascii") + except UnicodeDecodeError: + parser.charset = DEFAULT_CHARSET + + if parser.charset: + parser.charset = parser.charset.lower().replace("windows-", "cp") + + if title and parser.charset and ( + (parser.charset != universal_charset) or + ((not charset) or (charset != parser.charset))): + try: + if parser.meta_charset: + if log: log(" META charset : %s" % parser.charset) + elif (not charset) or (charset != parser.charset): + if log: log(" guessed charset: %s" % parser.charset) + #if log: log(" current charset: %s" % universal_charset) + if log: log(" title : %s" % title) + if parser.charset != universal_charset: + try: + converted_title = unicode(title, parser.charset).encode(universal_charset) + except UnicodeError: + if log: log(" incorrect conversion from %s, converting from %s" % (parser.charset, DEFAULT_CHARSET)) + converted_title = unicode(title, DEFAULT_CHARSET, "replace").encode(universal_charset, "replace") + parser.charset = DEFAULT_CHARSET + if log and (converted_title != title): log(" converted title: %s" % converted_title) + except LookupError: + if log: log(" unknown charset: '%s'" % parser.charset) + else: + if log: log(" title : %s" % title) + + if title: + final_title = recode_entities(converted_title, universal_charset) + parts = [s.strip() for s in final_title.replace('\r', '').split('\n')] + final_title = ' '.join([s for s in parts if s]) + if log and (final_title != converted_title): log(" final title : %s" % final_title) + parser.title = final_title + + icon = parser.icon + if isinstance(icon, unicode): + try: + parser.icon = icon.encode('ascii') + except UnicodeEncodeError: + if parser.charset: + parser.icon = icon.encode(parser.charset) + return parser def parse_filename(filename, charset=None, log=None): fp = open(filename, 'r')