4 from HTMLParser import HTMLParseError
5 from m_lib.net.www.html import HTMLParser as _HTMLParser
# Exception type caught together with HTMLParseError in parse_html().
# NOTE(review): the name suggests it is raised to stop parsing once the
# document <head> has been processed; the raise site is not visible in this
# excerpt -- confirm.
8 class HTMLHeadDone(Exception): pass
# Specialisation of the m_lib HTML parser (imported as _HTMLParser) whose
# do_meta() stores the charset declared by a Content-Type <meta> in
# self.charset.
# NOTE(review): the embedded line numbers skip, so some statements of this
# class are not shown here.
11 class HTMLParser(_HTMLParser):
# Inspect one tag's attributes, given as (name, value) pairs in `attrs`.
# NOTE(review): presumably called by the base parser for every <meta> tag
# (sgmllib-style do_<tag> handler) -- confirm against m_lib.
16 def do_meta(self, attrs):
20 for attrname, value in attrs:
23 if attrname == 'http-equiv':
# Lower-cased so the comparison with "content-type" below ignores case.
24 http_equiv = value.lower()
25 elif attrname == 'content':
# NOTE(review): `content` is read below, but its assignment (and any default
# for `http_equiv`) is not visible in this excerpt.
28 if http_equiv == "content-type":
30 # extract charset from "text/html; foo; charset=UTF-8; bar;"
# Keeps the lower-cased text that follows the first 'charset=' up to the next
# ';' (or the end of the string). Indexing [1] raises IndexError when
# 'charset=' is absent, unless a guard exists on a line not shown here.
31 self.charset = content.lower().split('charset=')[1].split(';')[0]
# Parse the HTML file named `filename` and normalise any charset the parser
# recorded.
# NOTE(review): creation of `parser`, the `try:` statements, the handler
# bodies and the return statement are not visible in this excerpt. The
# __main__ code assigns the result to `parser` and probes it for `charset`,
# so the function presumably returns the parser object -- confirm.
37 def parse_html(filename):
# Opened in text mode; no matching close() is visible in this excerpt.
38 infile = open(filename, 'r')
# Both handlers catch parser errors and HTMLHeadDone; what they do afterwards
# is not shown.
44 except (HTMLParseError, HTMLHeadDone):
51 except (HTMLParseError, HTMLHeadDone):
# `charset` exists only if do_meta() found a Content-Type declaration.
54 if hasattr(parser, "charset"):
# Rewrite a "windows-" prefix as "cp" (e.g. "windows-1251" -> "cp1251"),
# then lower-case the result.
55 parser.charset = parser.charset.replace("windows-", "cp").lower()
# Command-line entry point: sys.argv[1] names the HTML file to inspect.
# NOTE(review): the imports of `sys` and `chardet` are not visible in this
# excerpt.
60 if __name__ == '__main__':
63 parser = parse_html(sys.argv[1])
# True when the document declared a charset in a Content-Type <meta>.
64 if hasattr(parser, "charset"):
# Guess the encoding from the file's contents with chardet.
# NOTE(review): presumably the fallback used when no charset was declared
# (the else branch is not visible) -- confirm. The file object opened here
# is never closed explicitly.
68 charset = chardet.detect(open(sys.argv[1]).read())["encoding"]
# Special-cases two detection results; the handling itself is not shown.
69 if charset in ("ISO-8859-2", "MacCyrillic"):