X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fparse_html_htmlparser.py;h=77021624ff9543a82ebe26cc170398a49f13a6bb;hb=ab815a37e54d51b330eb1c8a756f9d52bba859ad;hp=e1a35f1007babbf5a31c93751da082dc3e94156d;hpb=a147d51d168748fe91f9ee8e27fcc065d12658d8;p=bookmarks_db.git diff --git a/Robots/parse_html_htmlparser.py b/Robots/parse_html_htmlparser.py index e1a35f1..7702162 100644 --- a/Robots/parse_html_htmlparser.py +++ b/Robots/parse_html_htmlparser.py @@ -1,7 +1,7 @@ """ HTML Parser - Written by BroytMann. Copyright (C) 1997-2007 PhiloSoft Design + Written by Broytman. Copyright (C) 1997-2010 PhiloSoft Design """ from HTMLParser import HTMLParseError @@ -16,8 +16,8 @@ class HTMLParser(_HTMLParser): _HTMLParser.__init__(self) self.charset = charset self.meta_charset = 0 - self.title = '' - self.refresh = '' + self.title = None + self.refresh = None self.icon = None def end_head(self): @@ -38,8 +38,8 @@ class HTMLParser(_HTMLParser): if (not self.charset) and (http_equiv == "content-type"): try: - # extract charset from "text/html; foo; charset=UTF-8; bar;" - self.charset = content.lower().split('charset=')[1].split(';')[0] + # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" + self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] self.meta_charset = 1 # Remember that the charset was retrieved from # META tag, not from the Content-Type header except IndexError: @@ -63,7 +63,7 @@ class HTMLParser(_HTMLParser): for attrname, value in attrs: if value: - value = value.strip().lower() + value = value.strip() if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')): has_icon = True elif attrname == 'href': @@ -73,7 +73,7 @@ class HTMLParser(_HTMLParser): self.icon = href -def parse_html(filename, charset=None): +def parse_html(filename, charset=None, log=None): infile = open(filename, 'r') parser = HTMLParser(charset) @@ -90,4 +90,7 @@ def parse_html(filename, charset=None): except (HTMLParseError, HTMLHeadDone): pass + if parser.title is None: + return None + return parser