X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2Fbkmk_ph_htmlparser.py;h=45e89f5119817787b0b926c7f43627a7dc22e7dd;hb=c88cb7a75e7caf1d67466cfa107981d95115fa0c;hp=5c0a440ff7fea80bba26b58a0e729b75df8567cc;hpb=71712390f4edb041609ff7bc9272d12a5c1a9b1d;p=bookmarks_db.git diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index 5c0a440..45e89f5 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -1,10 +1,11 @@ """HTML Parser using Pythons' HTMLParser This file is a part of Bookmarks database and Internet robot. + """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -18,82 +19,80 @@ class HTMLHeadDone(Exception): pass class HTMLParser(_HTMLParser): - def __init__(self, charset=None): - _HTMLParser.__init__(self) - self.charset = charset - self.meta_charset = 0 - self.title = None - self.refresh = None - self.icon = None - - def end_head(self): - raise HTMLHeadDone() - - def do_meta(self, attrs): - http_equiv = "" - content = "" - - for attrname, value in attrs: - if value: - value = value.strip() - if attrname == 'http-equiv': - http_equiv = value.lower() - elif attrname == 'content': - content = value - - if (not self.charset) and (http_equiv == "content-type"): - try: - # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" - self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] - self.meta_charset = 1 # Remember that the charset was retrieved from - # META tag, not from the Content-Type header - except IndexError: - pass - - if http_equiv == "refresh": - self.refresh = content - - def start_title(self, attrs): - self.accumulator = '' - - def end_title(self): - if not self.title: # use only the first title - self.title = self.accumulator - - def do_link(self, attrs): - has_icon = False - href = None - - for attrname, value in attrs: - if value: - value = value.strip() - if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')): - has_icon = True - elif attrname == 'href': - href = value - - if has_icon: - self.icon = href - - -def parse_html(filename, charset=None, log=None): - infile = open(filename, 'r') - parser = HTMLParser(charset) - - for line in infile: - try: - parser.feed(line) - except (HTMLParseError, HTMLHeadDone): - break - - infile.close() - - try: - parser.close() - except (HTMLParseError, HTMLHeadDone): - pass - - if parser.title is None: - return None - - return parser + def __init__(self, charset=None): + _HTMLParser.__init__(self) + self.charset = charset + self.meta_charset = 0 + self.title = None + self.refresh = None + self.icon = None + + def end_head(self): + raise HTMLHeadDone() + + def do_meta(self, attrs): + http_equiv = "" + content = "" + + for attrname, value in attrs: + if value: + value = value.strip() + if attrname == 'http-equiv': + http_equiv = value.lower() + elif attrname == 'content': + content = value + elif (attrname == 'charset') and (not self.charset): + self.charset = value.lower() + self.meta_charset = 1 + + if (not self.charset) and (http_equiv == "content-type"): + try: + # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" + self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] + self.meta_charset = 1 # Remember that the charset was retrieved from + # META tag, not from the Content-Type header + except IndexError: + pass + + if http_equiv == "refresh": + self.refresh = content + + def start_title(self, attrs): + self.accumulator = '' + + def end_title(self): + if not self.title: # use only the first title + self.title = self.accumulator + + def do_link(self, attrs): + has_icon = False + href = None + + for attrname, value in attrs: + if value: + value = value.strip() + if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')): + has_icon = True + elif attrname == 'href': + href = value + + if has_icon: + self.icon = href + + +def parse_html(html_text, charset=None, log=None): + parser = HTMLParser(charset) + + try: + parser.feed(html_text) + except (HTMLParseError, HTMLHeadDone): + pass + + try: + parser.close() + except (HTMLParseError, HTMLHeadDone): + pass + + if (parser.title is None) and (parser.refresh is None) and (parser.icon is None): + return None + return parser