"""HTML Parser using BeautifulSoup

Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
"""
7 from BeautifulSoup import BeautifulSoup
class DummyParser(object):
    """Plain container for the values extracted from one HTML document.

    The constructor only stores its arguments; it performs no parsing.
    """

    def __init__(self, charset, meta, title, refresh, icon):
        """Store the extracted values as attributes.

        charset -- stored as ``self.charset``
        meta    -- stored as ``self.meta_charset``
        title   -- stored as ``self.title``
        refresh -- stored as ``self.refresh``
        icon    -- stored as ``self.icon``
        """
        self.charset = charset
        self.meta_charset = meta
        # title and icon were accepted but silently dropped; keep them so
        # that every constructor argument is retrievable from the instance.
        self.title = title
        self.refresh = refresh
        self.icon = icon
def _head_attribute(root, matcher, attribute):
    """Return *attribute* of the first direct child of <head> accepted by *matcher*.

    Returns None when the document has no <html>/<head> element or when no
    direct child of <head> satisfies *matcher*.
    """
    try:
        tag = root.html.head.find(matcher, recursive=False)
    except AttributeError:
        # root.html or root.html.head is missing.
        return None
    if tag is None:
        # find() located nothing; there is no attribute to read.
        return None
    return tag.get(attribute)


def parse_html(filename, charset=None):
    """Parse the HTML file *filename* and collect metadata from its <head>.

    filename -- path of the HTML file to read
    charset  -- optional encoding passed to BeautifulSoup as ``fromEncoding``

    Returns a DummyParser built from:
      * the encoding reported by BeautifulSoup (``root.originalEncoding``),
      * ``False`` for the meta-charset flag,
      * the <title> text encoded with that encoding,
      * the ``content`` of the tag accepted by ``_find_refresh``,
      * the ``href`` of the tag accepted by ``_find_icon``.

    NOTE(review): the fallback values used when an element is absent
    ('' for the title, None for refresh and icon) are the reviewer's
    choice -- confirm against the callers of this function.
    """
    infile = open(filename, 'r')
    try:
        root = BeautifulSoup(infile, fromEncoding=charset)
    finally:
        # Release the file handle even if parsing raises.
        infile.close()

    # From here on use the encoding the parser actually settled on.
    charset = root.originalEncoding

    try:
        title = root.html.head.title.string.encode(charset)
    except AttributeError:
        # Some link of the html/head/title/string chain is missing.
        title = ''

    refresh = _head_attribute(root, _find_refresh, "content")
    icon = _head_attribute(root, _find_icon, "href")

    return DummyParser(charset, False, title, refresh, icon)
53 def _find_refresh(Tag):
54 return (Tag.name == "meta") and \
55 (Tag.get("http-equiv", '').lower() == "refresh")
58 return (Tag.name == "link") and \
59 (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))