return j
-def parse_html(filename, charset=None, log=None):
+# Open `filename` in text read mode and build a BadDeclParser from the file
+# object, passing `charset` through as the parser's `fromEncoding` argument.
+# Returns the parser object on success, or None if a TypeError is raised
+# while the parser is being constructed.  The file is closed on every path.
+# NOTE(review): what makes BadDeclParser raise TypeError is not visible
+# here -- confirm before relying on None as the "unparseable" signal.
+# Callers must check for None before reading attributes of the result.
+def _parse_html(filename, charset):
infile = open(filename, 'r')
try:
- root = BadDeclParser(infile, fromEncoding=charset)
+# Returning from inside `try` still runs the `finally` clause below,
+# so the file handle is released before the caller sees the result.
+ return BadDeclParser(infile, fromEncoding=charset)
except TypeError:
return None
finally:
infile.close()
+def parse_html(filename, charset=None, log=None):
+ root = _parse_html(filename, charset)
+
+ _charset = root.originalEncoding
+ if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
+ _charset = DEFAULT_CHARSET
+ root = _parse_html(filename, _charset)
+
try:
html = root.html
except AttributeError:
if head is None:
head = html # Some sites put TITLE in HTML without HEAD
- _charset = root.originalEncoding
- if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
- _charset = DEFAULT_CHARSET
-
try:
title = head.title.string.encode(_charset)
except AttributeError: