"""
HTML Parser using BeautifulSoup
- Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
+ Written by BroytMann. Copyright (C) 2007, 2008 PhiloSoft Design
"""
import re
root = BadDeclParser(infile, fromEncoding=charset)
except TypeError:
return None
- infile.close()
+ finally:
+ infile.close()
- _charset = root.originalEncoding
try:
- title = root.html.head.title.string.encode(_charset)
+ head = root.html.head
except AttributeError:
return None
- meta = root.html.head.find(_find_refresh, recursive=False)
+ if head is None:
+ head = root.html # Some sites put TITLE in HTML without HEAD
+
+ _charset = root.originalEncoding
+ try:
+ title = head.title.string.encode(_charset)
+ except AttributeError:
+ title = '' # HEAD but no TITLE
+
+ if (not title) and (head is not root.html):
+ # Some sites put TITLE in HTML outside of HEAD
+
+ try:
+ title = root.html.title.string.encode(_charset)
+ except AttributeError:
+ title = '' # no TITLE in HTML either
+
+ meta = head.find(_find_refresh, recursive=False)
if meta:
refresh = meta.get("content")
else:
refresh = None
- meta = root.html.head.find(_find_icon, recursive=False)
+ meta = head.find(_find_icon, recursive=False)
if meta:
icon = meta.get("href")
else: