2 HTML Parser using BeautifulSoup
4 Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
7 from HTMLParser import HTMLParser
8 from BeautifulSoup import BeautifulSoup
class BSoupParser(HTMLParser):
    """Container for the facts extracted from one parsed HTML document.

    Every constructor argument is stored unchanged on the instance:

    charset      -- encoding of the document.
    meta_charset -- the ``meta`` argument; ``parse_html`` passes whether the
                    encoding reported by BeautifulSoup differs from the one
                    that was requested.
    title        -- document title.
    refresh      -- value of the refresh directive, if any.
    icon         -- location of the document icon, if any.
    """

    def __init__(self, charset, meta, title, refresh, icon):
        # Initialise the parser state of the base class.  The base class is
        # called explicitly (not via super()) because the Python 2
        # HTMLParser imported by this module is an old-style class.
        HTMLParser.__init__(self)
        self.charset = charset
        self.meta_charset = meta
        # title and icon are part of the signature, so keep them available
        # to callers instead of silently dropping them.
        self.title = title
        self.refresh = refresh
        self.icon = icon
def parse_html(filename, charset=None):
    """Parse an HTML file with BeautifulSoup and summarise its header.

    filename -- path of the HTML file to read.
    charset  -- encoding hint handed to BeautifulSoup as ``fromEncoding``;
                ``None`` passes no hint.

    Returns a ``BSoupParser`` built from:
      * the encoding BeautifulSoup reports in ``originalEncoding``;
      * a flag telling whether that encoding differs from ``charset``;
      * the text of ``<title>`` encoded to that encoding, or ``''`` when the
        document has no usable title;
      * the ``content`` of the first ``<meta http-equiv="refresh">`` that is
        a direct child of ``<head>``, or ``None``;
      * the ``href`` of the first icon ``<link>`` that is a direct child of
        ``<head>``, or ``None``.
    """
    # The context manager closes the file even if BeautifulSoup raises.
    with open(filename, 'r') as infile:
        root = BeautifulSoup(infile, fromEncoding=charset)

    _charset = root.originalEncoding

    # A document without <html> or <head> yields None here; resolve the
    # head once so that the lookups below cannot fail on a missing tag.
    html = root.html
    head = html.head if html is not None else None

    # NOTE(review): the fallback values ('' for the title, None for refresh
    # and icon) were chosen during review -- confirm against the callers.
    try:
        title = head.title.string.encode(_charset)
    except AttributeError:
        # No <head>, no <title>, or a title without a plain string.
        title = ''

    refresh = _head_attribute(head, _find_refresh, "content")
    icon = _head_attribute(head, _find_icon, "href")

    return BSoupParser(_charset, _charset != charset, title, refresh, icon)


def _head_attribute(head, matcher, attribute):
    """Return ``attribute`` of the first direct child of ``head`` accepted
    by ``matcher``; ``None`` when there is no head or no matching tag."""
    if head is None:
        return None
    tag = head.find(matcher, recursive=False)
    if tag is None:
        return None
    return tag.get(attribute)
49 def _find_refresh(Tag):
50 return (Tag.name == "meta") and \
51 (Tag.get("http-equiv", '').lower() == "refresh")
54 return (Tag.name == "link") and \
55 (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))