1 """HTML Parser using Python's HTMLParser
3 This file is a part of Bookmarks database and Internet robot.
6 __author__ = "Oleg Broytman <phd@phdru.name>"
7 __copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design"
8 __license__ = "GNU GPL"
10 __all__ = ['parse_html']
13 from HTMLParser import HTMLParseError
14 from m_lib.net.www.html import HTMLParser as _HTMLParser
class HTMLHeadDone(Exception):
    """Exception that parse_html() catches together with HTMLParseError.

    NOTE(review): the name suggests it is raised once the document head has
    been processed; the raise site is not visible here -- confirm.
    """
# Parser subclass that stores document metadata on the instance: the charset
# declared in a META tag, the content of a META refresh, and the first title.
# NOTE(review): this view of the class has gaps (indentation is lost and
# several statements are not shown), so the comments below describe only what
# the visible lines do.
20 class HTMLParser(_HTMLParser):
# charset: a charset that is already known, or None (the default).  When it is
# set, do_meta() does not replace it with a charset found in a META tag.
21 def __init__(self, charset=None):
22 _HTMLParser.__init__(self)
23 self.charset = charset
# Handler for a META tag.  attrs is iterated as (attrname, value) pairs.
# NOTE(review): the initial values of http_equiv and content, and the
# statement that binds content, are not visible here -- confirm.
32 def do_meta(self, attrs):
36 for attrname, value in attrs:
39 if attrname == 'http-equiv':
# Lowercased so that the comparisons below are case-insensitive.
40 http_equiv = value.lower()
41 elif attrname == 'content':
# A META content-type is consulted only when no charset has been set yet.
44 if (not self.charset) and (http_equiv == "content-type"):
46 # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
# Takes the text after the first "charset=", cut at the first ';' and then at
# the first ','.  The [1] index raises IndexError when content has no
# "charset=" part.
# NOTE(review): no handler for that is visible in this view -- confirm.
47 self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
48 self.meta_charset = 1 # Remember that the charset was retrieved from
49 # META tag, not from the Content-Type header
# The content of a META refresh is stored unmodified.
53 if http_equiv == "refresh":
54 self.refresh = content
# Handler for the opening TITLE tag.
# NOTE(review): the lines numbered 57-59 are not visible; the test below may
# belong to a different handler (for example one for the closing tag), and
# the place where self.accumulator is filled is not shown -- confirm.
56 def start_title(self, attrs):
60 if not self.title: # use only the first title
61 self.title = self.accumulator
# Handler for a LINK tag.  attrs is iterated as (attrname, value) pairs.
# NOTE(review): what is done with a matching rel and with href is not visible
# here -- confirm.
63 def do_link(self, attrs):
67 for attrname, value in attrs:
# rel is compared case-insensitively against "icon" and "shortcut icon".
70 if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')):
72 elif attrname == 'href':
79 def parse_html(filename, charset=None, log=None):
80 infile = open(filename, 'r')
81 parser = HTMLParser(charset)
86 except (HTMLParseError, HTMLHeadDone):
93 except (HTMLParseError, HTMLHeadDone):
96 if (parser.title is None) and (parser.refresh is None) and (parser.icon is None):