"""
HTML Parser using BeautifulSoup

Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
"""
7 from HTMLParser import HTMLParser
8 from BeautifulSoup import BeautifulSoup
class BSoupParser(HTMLParser):
    """Holder for the page properties that parse_html() extracts.

    The class derives from HTMLParser but defines no handlers of its own;
    it only stores the values handed to the constructor as attributes.
    """

    def __init__(self, charset, meta, title, refresh, icon):
        """Store the extracted page properties on the instance.

        charset -- encoding name of the document
        meta    -- stored as ``meta_charset``
                   (NOTE(review): presumably tells whether the charset came
                   from a <meta> tag -- confirm against callers)
        title   -- page title
        refresh -- ``content`` value of the refresh <meta> tag, or None
        icon    -- ``href`` value of the icon <link> tag, or None
        """
        # Call the base initializer explicitly: on Python 2 HTMLParser is an
        # old-style class, so super() is not available.
        HTMLParser.__init__(self)
        self.charset = charset
        self.meta_charset = meta
        self.title = title
        self.refresh = refresh
        self.icon = icon
def _head_attribute(root, matcher, attribute):
    """Return `attribute` of the first direct <head> child accepted by `matcher`.

    Returns None when the document has no <html>/<head> element or when no
    direct child of <head> satisfies `matcher`.
    """
    try:
        tag = root.html.head.find(matcher, recursive=False)
    except AttributeError:
        # root.html or root.html.head is None: the document has no <head>.
        return None
    if tag is None:
        return None
    return tag.get(attribute)


def parse_html(filename, charset=None):
    """Parse the HTML file `filename` and return a BSoupParser describing it.

    filename -- path of the HTML file to read
    charset  -- optional encoding hint, passed to BeautifulSoup as
                ``fromEncoding``

    The returned BSoupParser carries the encoding BeautifulSoup settled on
    (falling back to the `charset` hint when none is reported), the page
    title encoded in that charset, the ``content`` of the refresh <meta> tag
    and the ``href`` of the icon <link> tag.

    NOTE(review): the fallback values used when an element is absent
    ('' for the title, None for refresh and icon) were reconstructed --
    confirm them against the callers.
    """
    infile = open(filename, 'r')
    try:
        root = BeautifulSoup(infile, fromEncoding=charset)
    finally:
        # Release the file handle even when parsing raises.
        infile.close()

    # Prefer the encoding BeautifulSoup reports; keep the caller's hint when
    # it reports none.
    charset = root.originalEncoding or charset

    try:
        title = root.html.head.title.string.encode(charset)
    except (AttributeError, TypeError):
        # AttributeError: <html>, <head>, <title> or its text is missing.
        # TypeError: no encoding is known (charset is None).
        title = ''

    refresh = _head_attribute(root, _find_refresh, "content")
    icon = _head_attribute(root, _find_icon, "href")

    return BSoupParser(charset, False, title, refresh, icon)
55 def _find_refresh(Tag):
56 return (Tag.name == "meta") and \
57 (Tag.get("http-equiv", '').lower() == "refresh")
60 return (Tag.name == "link") and \
61 (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))