X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2Fbkmk_ph_htmlparser.py;h=d11a2ff9fbeab4b5e5ec8daa1c2a7b4205ac63e4;hb=bd078d376a721b31918b60c41ebf15be408bf52a;hp=05ad6584d47ec37f67ae5883181fdc6de5328880;hpb=c2ea4e82718b903aa123dd77490f36657383b0ca;p=bookmarks_db.git diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index 05ad658..d11a2ff 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -11,7 +11,10 @@ __license__ = "GNU GPL" __all__ = ['parse_html'] -from HTMLParser import HTMLParseError +try: + from HTMLParser import HTMLParseError +except ImportError: + class HTMLParseError(Exception): pass from m_lib.net.www.html import HTMLParser as _HTMLParser @@ -47,8 +50,10 @@ class HTMLParser(_HTMLParser): if (not self.charset) and (http_equiv == "content-type"): try: - # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" - self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] + # extract charset from + # "text/html; foo; charset=UTF-8, bar; baz;" + self.charset = content.lower().split('charset=')[1].\ + split(';')[0].split(',')[0] # Remember that the charset was retrieved from # META tag, not from the Content-Type header self.meta_charset = 1 @@ -72,7 +77,9 @@ class HTMLParser(_HTMLParser): for attrname, value in attrs: if value: value = value.strip() - if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')): + if (attrname == 'rel') and ( + value.lower() in ('icon', 'shortcut icon') + ): has_icon = True elif attrname == 'href': href = value @@ -82,6 +89,13 @@ class HTMLParser(_HTMLParser): def parse_html(html_text, charset=None, log=None): + if not html_text: + return None + if charset is None and isinstance(html_text, bytes): + return None # html.parser cannot parse bytes + if charset and isinstance(html_text, bytes): + html_text = html_text.decode(charset) + parser = HTMLParser(charset) try: @@ -94,6 +108,7 @@ def parse_html(html_text, charset=None, log=None): except (HTMLParseError, HTMLHeadDone): pass - if (parser.title is None) and (parser.refresh is None) and (parser.icon is None): + if (parser.title is None) and (parser.refresh is None) \ + and (parser.icon is None): return None return parser