4 Written by Broytman. Copyright (C) 1997-2008 PhiloSoft Design
7 from HTMLParser import HTMLParseError
8 from m_lib.net.www.html import HTMLParser as _HTMLParser
# Marker exception with no behaviour of its own (empty body, derives directly
# from Exception). parse_html() catches it together with HTMLParseError.
# NOTE(review): the raise site is not part of this excerpt; presumably it is
# raised once the <head> section has been consumed so parsing can stop early
# -- confirm against the full file.
11 class HTMLHeadDone(Exception): pass
# Parser built on the m_lib.net.www.html parser (imported here as
# _HTMLParser). The handlers visible in this excerpt collect the document
# charset, a "refresh" META value and the title into instance attributes.
14 class HTMLParser(_HTMLParser):
# Constructor.
#   charset -- charset already known to the caller, or None (the default).
#              do_meta() only fills self.charset from a META tag while it
#              is still falsy, so a value given here takes precedence.
# The base-class initialiser runs first, then the charset is stored.
# NOTE(review): original lines 18-26 are not shown in this excerpt; further
# attribute initialisation may follow there.
15 def __init__(self, charset=None):
16 _HTMLParser.__init__(self)
17 self.charset = charset
# Handler for a META tag; `attrs` is iterated as (attrname, value) pairs.
# Visible effects:
#   * http-equiv="content-type" while self.charset is still falsy: the
#     charset is cut out of the content string and stored in self.charset,
#     and self.meta_charset is set to 1.
#   * http-equiv="refresh": the content string is stored in self.refresh.
# NOTE(review): the initial values of `http_equiv` and `content`, and the
# body of the 'content' branch, sit on lines missing from this excerpt --
# confirm both names are bound before the loop so a META tag lacking either
# attribute cannot raise NameError.
27 def do_meta(self, attrs):
31 for attrname, value in attrs:
34 if attrname == 'http-equiv':
# Lower-cased so the comparisons below are case-insensitive.
35 http_equiv = value.lower()
36 elif attrname == 'content':
# An explicitly supplied charset wins: the META value is only used when
# self.charset is falsy.
39 if (not self.charset) and (http_equiv == "content-type"):
41 # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
# The whole content string is lower-cased first, so the stored charset name
# is lower-case as well. Everything after the first 'charset=' is taken and
# then truncated at the first ';' and at the first ','.
# NOTE(review): split('charset=')[1] raises IndexError when the content
# string has no 'charset=' part; original lines 40 and 45-47 are not shown,
# so verify that this statement is guarded there.
42 self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
43 self.meta_charset = 1 # Remember that the charset was retrieved from
44 # META tag, not from the Content-Type header
# The refresh content is stored verbatim (it is not lower-cased).
48 if http_equiv == "refresh":
49 self.refresh = content
# Handler for the opening TITLE tag.
# NOTE(review): original lines 53-55 are missing from this excerpt, so the
# body of start_title itself is not visible; the two statements below may
# belong to a different handler (presumably the closing-tag one) whose
# `def` line is not shown -- confirm against the full file.
52 def start_title(self, attrs):
# Copy the text gathered in self.accumulator into self.title, but only when
# self.title is still falsy, so later TITLE elements never overwrite it.
56 if not self.title: # use only the first title
57 self.title = self.accumulator
# Handler for a LINK tag; `attrs` is iterated as (attrname, value) pairs.
# The visible lines recognise rel="icon" / rel="shortcut icon" and the href
# attribute. What is done in either branch is on lines not shown here.
60 def do_link(self, attrs):
64 for attrname, value in attrs:
# Normalise the attribute value: surrounding whitespace removed, lower-cased.
# NOTE(review): original line 65 is missing; if this statement runs for
# every attribute, the href value is lower-cased too, which would alter
# case-sensitive URLs -- confirm.
66 value = value.strip().lower()
# NOTE(review): value.lower() looks redundant when the statement above has
# already lower-cased `value`; harmless, but verify against line 65.
67 if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')):
69 elif attrname == 'href':
# Parse the HTML file `filename` with the HTMLParser class defined above.
#   filename -- path handed to open(); read in text mode ('r').
#   charset  -- passed straight to HTMLParser(); None (default) lets the
#               parser take the charset from a META tag.
#   log      -- not referenced in the lines visible here.
# NOTE(review): the parsing calls, both exception-handler bodies and the
# return value are on lines missing from this excerpt.
76 def parse_html(filename, charset=None, log=None):
# NOTE(review): no close() of `infile` is visible in this excerpt; confirm
# the file is closed on every path, including the exception paths below.
77 infile = open(filename, 'r')
78 parser = HTMLParser(charset)
# Two separate parsing stages each treat HTMLParseError and HTMLHeadDone the
# same way, so HTMLHeadDone is handled here and not propagated.
# NOTE(review): what each stage does is not visible here.
83 except (HTMLParseError, HTMLHeadDone):
90 except (HTMLParseError, HTMLHeadDone):