current_charset = default_encoding.replace("windows-", "cp")
DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
+parsers = []
try:
- from parse_html_beautifulsoup import parse_html as _parse_html
+ from parse_html_beautifulsoup import parse_html
except ImportError:
- from parse_html_htmlparser import parse_html as _parse_html
+ pass
+else:
+ parsers.append(parse_html)
+
+from parse_html_htmlparser import parse_html
+parsers.append(parse_html)
import re
except (ValueError, LookupError):
charset = None # ...try charset from HTML
- parser = _parse_html(filename, charset)
- title = parser.title
+ for p in parsers:
+ parser = p(filename, charset)
+ if parser:
+ break
+ else:
+ if log: log("Parser %s failed, trying next one." % p)
+ title = parser.title
if not parser.charset:
try:
unicode(title, "ascii")