From: Oleg Broytman Date: Wed, 11 Aug 2010 20:07:29 +0000 (+0000) Subject: Added HTML Parser based on html5 library. X-Git-Tag: v4.5.3~114 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=928e59567b0ba5e11efe915ae28d0e89f52bcc4a;p=bookmarks_db.git Added HTML Parser based on html5 library. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@262 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- diff --git a/Robots/parse_html.py b/Robots/parse_html.py index b51bd31..f50b01b 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -12,6 +12,13 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] +try: + import parse_html_html5 +except ImportError: + pass +else: + parsers.append(parse_html_html5.parse_html) + try: import parse_html_beautifulsoup parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET diff --git a/Robots/parse_html_html5.py b/Robots/parse_html_html5.py new file mode 100644 index 0000000..511e640 --- /dev/null +++ b/Robots/parse_html_html5.py @@ -0,0 +1,66 @@ +""" + HTML Parser using html5. + + Written by Broytman. 
# Copyright (C) 2010 PhiloSoft Design

from html5lib import HTMLParser as HTML5Parser
from parse_html_util import HTMLParser


def _find_child(parent, name):
    """Return the first direct child of *parent* named *name*, or None."""
    for node in parent.childNodes:
        if node.name == name:
            return node
    return None


def _attr(node, name):
    """Return attribute *name* of *node*, stripped; '' if missing or empty."""
    return (node.attributes.get(name) or '').strip()


def _http_equiv_metas(head, value):
    """Yield <meta> children of *head* whose http-equiv equals *value*.

    *value* must be lower-case; the attribute is compared
    case-insensitively because pages commonly write "Content-Type"
    or "Refresh" with capitals.
    """
    for node in head.childNodes:
        if node.name == 'meta' and _attr(node, 'http-equiv').lower() == value:
            yield node


def _charset_from_content(content):
    """Extract the charset from a Content-Type meta value.

    Returns the lower-cased charset name with surrounding whitespace and
    quotes removed, or False if *content* is empty or names no charset.
    """
    if not content:
        return False
    parts = content.lower().split('charset=')
    if len(parts) < 2:
        return False
    return parts[1].split(';')[0].strip().strip('\'"') or False


def parse_html(filename, charset=None, log=None):
    """Parse the HTML file *filename* with html5lib and collect head data.

    Parameters:
        filename -- path of the HTML file to parse.
        charset  -- optional encoding; passed to the html5lib parser and,
                    if set, used to encode the title.
        log      -- accepted for interface compatibility; not used here.

    Returns the result of
    ``HTMLParser(charset, meta_charset, title, refresh, icon)`` where
    ``meta_charset`` is the charset found in a
    ``<meta http-equiv="content-type">`` tag (False if none), ``title`` is
    the text of the first ``<title>``, ``refresh`` is the content of the
    first ``<meta http-equiv="refresh">`` and ``icon`` is the href of the
    first ``<link rel="icon">`` / ``<link rel="shortcut icon">``; each of
    the last three is None when absent.
    """
    # Close the file even if the parser raises.
    with open(filename) as fp:
        html_tree = HTML5Parser().parse(fp, charset)

    # Look the <html> element up by name: a comment following </html> is
    # appended to the document, so the last child is not always <html>.
    html = _find_child(html_tree, 'html')
    head = _find_child(html, 'head') if html is not None else None

    meta_charset = False
    title = None
    refresh = None
    icon = None

    if head is not None:
        # Use the first content-type meta that actually names a charset.
        for meta in _http_equiv_metas(head, 'content-type'):
            found = _charset_from_content(meta.attributes.get('content'))
            if found:
                meta_charset = found
                break

        title_node = _find_child(head, 'title')
        # An empty <title></title> has no child node to read.
        if title_node is not None and title_node.childNodes:
            title = title_node.childNodes[0].value

        if title and (charset or meta_charset):
            # NOTE(review): an unknown charset name raises LookupError
            # here, exactly as before; confirm how callers want a bogus
            # meta charset handled.
            title = title.encode(charset or meta_charset)

        for meta in _http_equiv_metas(head, 'refresh'):
            refresh = meta.attributes.get('content')
            break

        for node in head.childNodes:
            # rel is a space-separated, case-insensitive token list.
            if node.name == 'link' and \
                    'icon' in _attr(node, 'rel').lower().split():
                icon = node.attributes.get('href')
                break

    return HTMLParser(charset, meta_charset, title, refresh, icon)