parsers = []
try:
- import parse_html_etreetidy
+ import parse_html_beautifulsoup
+ parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
except ImportError:
pass
else:
- parsers.append(parse_html_etreetidy.parse_html)
+ parsers.append(parse_html_beautifulsoup.parse_html)
try:
- import parse_html_beautifulsoup
- parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
+ from parse_html_lxml import parse_html
except ImportError:
pass
else:
- parsers.append(parse_html_beautifulsoup.parse_html)
+ parsers.append(parse_html)
+
+try:
+ from parse_html_htmlparser import parse_html
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html)
-from parse_html_htmlparser import parse_html
-parsers.append(parse_html)
+try:
+ import parse_html_html5
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html_html5.parse_html)
import re
def parse_html(filename, charset=None, log=None):
+ if not parsers:
+ return None
+
if charset:
try:
codecs.lookup(charset) # In case of unknown charset...
charsets = [universal_charset, DEFAULT_CHARSET]
if charset:
charset = charset.lower().replace("windows-", "cp")
- if charset not in charsets:
- charsets.insert(0, charset)
+ if charset in charsets:
+ charsets.remove(charset)
+ charsets.insert(0, charset)
for p in parsers:
parser = None
else:
if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__))
+ if not parser:
+ return None
+
converted_title = title = parser.title
if title and (not parser.charset):
try:
final_title = ' '.join([s for s in parts if s])
if log and (final_title <> converted_title): log(" final title : %s" % final_title)
parser.title = final_title
+
+ icon = parser.icon
+ if isinstance(icon, unicode):
+ try:
+ parser.icon = icon.encode('ascii')
+ except UnicodeEncodeError:
+ if parser.charset:
+ parser.icon = icon.encode(parser.charset)
return parser