except SGMLParseError:
# Could not parse the DOCTYPE declaration
# Try to just skip the actual declaration
- match = re.search(r'<!DOCTYPE([^>]*?)>', self.rawdata[i:], re.MULTILINE)
+ match = re.search(r'<!DOCTYPE([^>]*?)>', self.rawdata[i:], re.MULTILINE|re.I)
if match:
toHandle = self.rawdata[i:match.end()]
else:
return j
-def parse_html(filename, charset=None, log=None):
+def _parse_html(filename, charset):
infile = open(filename, 'r')
try:
- root = BadDeclParser(infile, fromEncoding=charset)
+ return BadDeclParser(infile, fromEncoding=charset)
except TypeError:
- if log: log("TypeError")
return None
finally:
infile.close()
+def parse_html(filename, charset=None, log=None):
+ root = _parse_html(filename, charset)
+ if root is None:
+ return None
+
+ _charset = root.originalEncoding
+ if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
+ _charset = DEFAULT_CHARSET
+ root = _parse_html(filename, _charset)
+ if root is None:
+ return None
+
try:
html = root.html
except AttributeError:
if head is None:
head = html # Some sites put TITLE in HTML without HEAD
- _charset = root.originalEncoding
- if _charset == "windows-1252": # Replace default
- _charset = DEFAULT_CHARSET
-
try:
title = head.title.string.encode(_charset)
except AttributeError: