__all__ = ['parse_html']
-from HTMLParser import HTMLParseError
+try:
+ from HTMLParser import HTMLParseError
+except ImportError:
+ class HTMLParseError(Exception): pass
from m_lib.net.www.html import HTMLParser as _HTMLParser
if (not self.charset) and (http_equiv == "content-type"):
try:
- # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
- self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
+ # extract charset from
+ # "text/html; foo; charset=UTF-8, bar; baz;"
+ self.charset = content.lower().split('charset=')[1].\
+ split(';')[0].split(',')[0]
# Remember that the charset was retrieved from
# META tag, not from the Content-Type header
self.meta_charset = 1
for attrname, value in attrs:
if value:
value = value.strip()
- if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')):
+ if (attrname == 'rel') and (
+ value.lower() in ('icon', 'shortcut icon')
+ ):
has_icon = True
elif attrname == 'href':
href = value
def parse_html(html_text, charset=None, log=None):
+ if not html_text:
+ return None
+ if charset is None and isinstance(html_text, bytes):
+ return None # html.parser cannot parse bytes
+ if charset and isinstance(html_text, bytes):
+ html_text = html_text.decode(charset)
+
parser = HTMLParser(charset)
try:
except (HTMLParseError, HTMLHeadDone):
pass
- if (parser.title is None) and (parser.refresh is None) and (parser.icon is None):
+ if (parser.title is None) and (parser.refresh is None) \
+ and (parser.icon is None):
return None
return parser