"""
HTML Parser
- Written by BroytMann. Copyright (C) 1997-2007 PhiloSoft Design
+ Written by Broytman. Copyright (C) 1997-2010 PhiloSoft Design
"""
from HTMLParser import HTMLParseError
_HTMLParser.__init__(self)
self.charset = charset
self.meta_charset = 0
- self.title = ''
- self.refresh = ''
+ self.title = None
+ self.refresh = None
self.icon = None
def end_head(self):
if (not self.charset) and (http_equiv == "content-type"):
try:
- # extract charset from "text/html; foo; charset=UTF-8; bar;"
- self.charset = content.lower().split('charset=')[1].split(';')[0]
+ # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
+ self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
self.meta_charset = 1 # Remember that the charset was retrieved from
# META tag, not from the Content-Type header
except IndexError:
for attrname, value in attrs:
if value:
- value = value.strip().lower()
+ value = value.strip()
if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')):
has_icon = True
elif attrname == 'href':
self.icon = href
-def parse_html(filename, charset=None):
+def parse_html(filename, charset=None, log=None):
infile = open(filename, 'r')
parser = HTMLParser(charset)
except (HTMLParseError, HTMLHeadDone):
pass
+ if parser.title is None:
+ return None
+
return parser