X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fparse_html_html5.py;h=43e8d74ba957f658ffef276cadb2782ac27e84e5;hb=b47c320c38663d1a6d5def2c2d34febcd075be89;hp=fcc7e158a20e2e0a6f35aa8840cbb197dd1c8755;hpb=99128bb510fe0114d77dd8365997eff53a6ab8de;p=bookmarks_db.git diff --git a/Robots/parse_html_html5.py b/Robots/parse_html_html5.py index fcc7e15..43e8d74 100644 --- a/Robots/parse_html_html5.py +++ b/Robots/parse_html_html5.py @@ -9,14 +9,22 @@ from parse_html_util import HTMLParser def parse_html(filename, charset=None, log=None): + parser = HTML5Parser() fp = open(filename) - html_tree = HTML5Parser().parse(fp, charset) + parser._parse(fp, encoding=charset, parseMeta=bool(charset)) fp.close() + html_tree = parser.tree.getDocument() - if not html_tree.childNodes: + for node in html_tree.childNodes: + if (node.name == 'html') and (node.type != 3): # Skip DocType element + html = node + break + else: + html = None + + if not html: return None - html = html_tree.childNodes[-1] for node in html.childNodes: if node.name == 'head': head = node @@ -30,6 +38,17 @@ def parse_html(filename, charset=None, log=None): icon = None if head: + for node in head.childNodes: + if node.name == 'title': + if node.childNodes: + title = node.childNodes[0].value + break + else: + title = '' + + if title is None: + return None + for node in head.childNodes: if node.name == 'meta' and \ ('http-equiv' in node.attributes) and \ @@ -44,15 +63,10 @@ def parse_html(filename, charset=None, log=None): else: break - for node in head.childNodes: - if node.name == 'title': - if node.childNodes: - title = node.childNodes[0].value - break - else: - title = '' + if not charset: + charset = parser.tokenizer.stream.charEncoding[0] - if title and (charset or meta_charset): + if charset or meta_charset: title = title.encode(charset or meta_charset) for node in head.childNodes: @@ -69,4 +83,16 @@ def parse_html(filename, charset=None, log=None): icon = node.attributes['href'] break + else: + for node in html.childNodes: + if node.name == 'title': + if node.childNodes: + title = node.childNodes[0].value + break + else: + title = '' + + if title is None: + return None + return HTMLParser(charset, meta_charset, title, refresh, icon)