From f10b20785d3f5ba0b42ff2ca6e13efb194fdd4cc Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Thu, 12 Aug 2010 09:05:40 +0000 Subject: [PATCH] Do not parse meta charset if there is HTTP charset. Find html in tree.childNodes skipping DocType. Use meta charset if there is no HTTP charset. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@267 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html_html5.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Robots/parse_html_html5.py b/Robots/parse_html_html5.py index fcc7e15..6255825 100644 --- a/Robots/parse_html_html5.py +++ b/Robots/parse_html_html5.py @@ -9,14 +9,22 @@ from parse_html_util import HTMLParser def parse_html(filename, charset=None, log=None): + parser = HTML5Parser() fp = open(filename) - html_tree = HTML5Parser().parse(fp, charset) + parser._parse(fp, encoding=charset, parseMeta=bool(charset)) fp.close() + html_tree = parser.tree.getDocument() - if not html_tree.childNodes: + for node in html_tree.childNodes: + if (node.name == 'html') and (node.type != 3): # Skip DocType element + html = node + break + else: + html = None + + if not html: return None - html = html_tree.childNodes[-1] for node in html.childNodes: if node.name == 'head': head = node @@ -52,6 +60,9 @@ def parse_html(filename, charset=None, log=None): else: title = '' + if not charset: + charset = parser.tokenizer.stream.charEncoding[0] + if title and (charset or meta_charset): title = title.encode(charset or meta_charset) -- 2.39.2