def parse_html(filename, charset=None):
    """Parse an HTML file and collect its title, refresh and icon data.

    The file is parsed with ``BeautifulSoup``, passing ``charset`` as the
    ``fromEncoding`` hint. The encoding reported by the parser
    (``originalEncoding``) is used to encode the document title.

    Args:
        filename: Path of the HTML file to parse.
        charset: Optional encoding hint handed to the parser.

    Returns:
        A ``BSoupParser`` built from the detected charset, a flag telling
        whether the detected charset differs from ``charset``, the encoded
        title, the ``content`` attribute of the element matched by
        ``_find_refresh`` (or ``None``), and the ``href`` attribute of the
        element matched by ``_find_icon`` (or ``None``).
        ``None`` is returned when the parser raises ``TypeError`` or when
        the ``html``/``head``/``title`` chain (or the title string) is
        missing.
    """
    # The context manager guarantees the file is closed on every exit path,
    # including the early return taken when the parser raises TypeError.
    with open(filename, 'r') as infile:
        try:
            root = BeautifulSoup(infile, fromEncoding=charset)
        except TypeError:
            return None

    _charset = root.originalEncoding
    try:
        head = root.html.head
        # NOTE(review): if originalEncoding is None, encode() is called with
        # None and may raise TypeError, which is not caught here -- confirm
        # whether that case can occur for file input.
        title = head.title.string.encode(_charset)
    except AttributeError:
        # A link in the html -> head -> title -> string chain is missing.
        return None

    # Only direct children of <head> are inspected (recursive=False).
    meta = head.find(_find_refresh, recursive=False)
    refresh = meta.get("content") if meta else None

    meta = head.find(_find_icon, recursive=False)
    icon = meta.get("href") if meta else None

    return BSoupParser(_charset, _charset != charset, title, refresh, icon)