X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=parse_html%2Fbkmk_ph_htmlparser.py;h=45e89f5119817787b0b926c7f43627a7dc22e7dd;hp=0798467386bda9ab993df160f1d4937b1e1802d9;hb=c88cb7a75e7caf1d67466cfa107981d95115fa0c;hpb=a04eaa0346e8aa5ad86a195f8f4d36487ebfe09c diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index 0798467..45e89f5 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -19,80 +19,80 @@ class HTMLHeadDone(Exception): pass class HTMLParser(_HTMLParser): - def __init__(self, charset=None): - _HTMLParser.__init__(self) - self.charset = charset - self.meta_charset = 0 - self.title = None - self.refresh = None - self.icon = None - - def end_head(self): - raise HTMLHeadDone() - - def do_meta(self, attrs): - http_equiv = "" - content = "" - - for attrname, value in attrs: - if value: - value = value.strip() - if attrname == 'http-equiv': - http_equiv = value.lower() - elif attrname == 'content': - content = value - elif (attrname == 'charset') and (not self.charset): - self.charset = value.lower() - self.meta_charset = 1 - - if (not self.charset) and (http_equiv == "content-type"): - try: - # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" - self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] - self.meta_charset = 1 # Remember that the charset was retrieved from - # META tag, not from the Content-Type header - except IndexError: - pass - - if http_equiv == "refresh": - self.refresh = content - - def start_title(self, attrs): - self.accumulator = '' - - def end_title(self): - if not self.title: # use only the first title - self.title = self.accumulator - - def do_link(self, attrs): - has_icon = False - href = None - - for attrname, value in attrs: - if value: - value = value.strip() - if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')): - has_icon = True - elif attrname == 'href': - href = value - - if has_icon: - self.icon = href + def __init__(self, charset=None): + _HTMLParser.__init__(self) + self.charset = charset + self.meta_charset = 0 + self.title = None + self.refresh = None + self.icon = None + + def end_head(self): + raise HTMLHeadDone() + + def do_meta(self, attrs): + http_equiv = "" + content = "" + + for attrname, value in attrs: + if value: + value = value.strip() + if attrname == 'http-equiv': + http_equiv = value.lower() + elif attrname == 'content': + content = value + elif (attrname == 'charset') and (not self.charset): + self.charset = value.lower() + self.meta_charset = 1 + + if (not self.charset) and (http_equiv == "content-type"): + try: + # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" + self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] + self.meta_charset = 1 # Remember that the charset was retrieved from + # META tag, not from the Content-Type header + except IndexError: + pass + + if http_equiv == "refresh": + self.refresh = content + + def start_title(self, attrs): + self.accumulator = '' + + def end_title(self): + if not self.title: # use only the first title + self.title = self.accumulator + + def do_link(self, attrs): + has_icon = False + href = None + + for attrname, value in attrs: + if value: + value = value.strip() + if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')): + has_icon = True + elif attrname == 'href': + href = value + + if has_icon: + self.icon = href def parse_html(html_text, charset=None, log=None): - parser = HTMLParser(charset) + parser = HTMLParser(charset) - try: - parser.feed(html_text) - except (HTMLParseError, HTMLHeadDone): - pass + try: + parser.feed(html_text) + except (HTMLParseError, HTMLHeadDone): + pass - try: - parser.close() - except (HTMLParseError, HTMLHeadDone): - pass + try: + parser.close() + except (HTMLParseError, HTMLHeadDone): + pass - if (parser.title is None) and (parser.refresh is None) and (parser.icon is None): - return None - return parser + if (parser.title is None) and (parser.refresh is None) and (parser.icon is None): + return None + return parser