1 #! /usr/bin/env python3
3 from m_lib.net.www.html import HTMLParser as _HTMLParser
class HTMLHeadDone(Exception):
    """Exception type defined alongside the HTML parser in this module.

    NOTE(review): presumably raised to abort parsing once the document
    <head> has been processed -- confirm against the raise site, which is
    not visible here.
    """
class HTMLParser(_HTMLParser):
    """HTML parser that records the charset declared by a <meta> tag.

    After a ``<meta http-equiv="Content-Type" content="...; charset=X">``
    tag has been handled, the declared charset is available (lower-cased)
    as ``self.charset``.  The attribute is left unset when no such
    declaration was seen, so callers can test for it with ``hasattr``.
    """

    def do_meta(self, attrs):
        """Handle a <meta> tag and pick up a Content-Type charset, if any.

        attrs -- iterable of (attribute name, value) pairs for the tag.

        Sets ``self.charset`` only when the tag has
        ``http-equiv="content-type"`` (case-insensitive) and its
        ``content`` value contains a ``charset=`` parameter.
        """
        # Start from empty strings so a tag lacking either attribute is
        # simply ignored instead of raising on an unbound local.
        http_equiv = ""
        content = ""

        for attrname, value in attrs:
            if not value:
                # Valueless attributes (None or "") carry no information.
                continue
            if attrname == 'http-equiv':
                http_equiv = value.lower()
            elif attrname == 'content':
                content = value

        if http_equiv != "content-type":
            return

        # extract charset from "text/html; foo; charset=UTF-8; bar;"
        pieces = content.lower().split('charset=')
        if len(pieces) > 1:
            # Whitespace around the name would break codec lookups later.
            self.charset = pieces[1].split(';')[0].strip()
32 def parse_html(filename):
33 infile = open(filename, 'r')
49 if hasattr(parser, "charset"):
50 parser.charset = parser.charset.replace("windows-", "cp").lower()
55 if __name__ == '__main__':
58 parser = parse_html(sys.argv[1])
59 if hasattr(parser, "charset"):
63 charset = chardet.detect(open(sys.argv[1]).read())["encoding"]
64 if charset in ("ISO-8859-2", "MacCyrillic"):