Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
"""
+from HTMLParser import HTMLParser
from BeautifulSoup import BeautifulSoup
class BSoupParser(HTMLParser):
    """Hold the values extracted from an HTML page by ``parse_html``.

    Attributes set by the constructor:
        charset -- the encoding reported for the document
        refresh -- the ``content`` of the refresh <meta> tag, or None
        icon    -- the ``href`` of the icon tag, or None
    """

    def __init__(self, charset, meta, title, refresh, icon):
        # Initialise the HTMLParser base class itself; calling
        # object.__init__ here would leave the parser state unset.
        HTMLParser.__init__(self)
        # NOTE(review): `meta` and `title` are accepted but not stored in
        # the code visible here -- confirm whether that is intended.
        self.charset = charset
        self.refresh = refresh
        self.icon = icon
+
def parse_html(filename, charset=None):
    """Parse the HTML file *filename* with BeautifulSoup.

    Arguments:
        filename -- path of the HTML file to read
        charset  -- encoding hint passed to BeautifulSoup as
                    ``fromEncoding``; None lets BeautifulSoup decide

    Returns a BSoupParser built from the document's encoding, title,
    refresh <meta> content and icon href.  Returns None when
    BeautifulSoup raises TypeError or when the document has no usable
    <html><head><title> text.
    """
    infile = open(filename, 'r')
    try:
        # Nested try blocks keep the file closed on every exit path,
        # including the early return on TypeError.
        try:
            root = BeautifulSoup(infile, fromEncoding=charset)
        except TypeError:
            return None
    finally:
        infile.close()

    # Encoding BeautifulSoup actually used; kept apart from the
    # caller-supplied `charset` so the two can be compared below.
    _charset = root.originalEncoding

    try:
        title = root.html.head.title.string.encode(_charset)
    except AttributeError:
        # Some element on the html/head/title/string path is missing.
        return None

    # The title lookup above succeeded, so root.html.head exists here.
    meta = root.html.head.find(_find_refresh, recursive=False)
    if meta:
        refresh = meta.get("content")
    else:
        refresh = None

    meta = root.html.head.find(_find_icon, recursive=False)
    if meta:
        icon = meta.get("href")
    else:
        icon = None

    # Second argument is True when the detected encoding differs from
    # the one the caller asked for.
    return BSoupParser(_charset, _charset != charset, title, refresh, icon)
def _find_refresh(Tag):
return (Tag.name == "meta") and \