current_charset = default_encoding.replace("windows-", "cp")
DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
+parsers = []
try:
- from parse_html_beautifulsoup import parse_html as _parse_html
+ from parse_html_beautifulsoup import parse_html
except ImportError:
- from parse_html_htmlparser import parse_html as _parse_html
+ pass
+else:
+ parsers.append(parse_html)
+
+from parse_html_htmlparser import parse_html
+parsers.append(parse_html)
import re
except (ValueError, LookupError):
charset = None # ...try charset from HTML
- parser = _parse_html(filename, charset)
- title = parser.title
+ for p in parsers:
+ parser = p(filename, charset)
+ if parser:
+ break
+ else:
+ if log: log("Parser %s failed, trying next one." % p)
+ title = parser.title
if not parser.charset:
try:
unicode(title, "ascii")
def parse_html(filename, charset=None):
infile = open(filename, 'r')
- root = BeautifulSoup(infile, fromEncoding=charset)
+ try:
+ root = BeautifulSoup(infile, fromEncoding=charset)
+ except TypeError:
+ return None
infile.close()
_charset = root.originalEncoding
try:
title = root.html.head.title.string.encode(_charset)
except AttributeError:
- title = ''
+ return None
- try:
- meta = root.html.head.find(_find_refresh, recursive=False)
- except AttributeError:
- refresh = None
+ meta = root.html.head.find(_find_refresh, recursive=False)
+ if meta:
+ refresh = meta.get("content")
else:
- if meta:
- refresh = meta.get("content")
- else:
- refresh = None
+ refresh = None
- try:
- meta = root.html.head.find(_find_icon, recursive=False)
- except AttributeError:
- icon = None
+ meta = root.html.head.find(_find_icon, recursive=False)
+ if meta:
+ icon = meta.get("href")
else:
- if meta:
- icon = meta.get("href")
- else:
- icon = None
+ icon = None
return BSoupParser(_charset, _charset != charset, title, refresh, icon)