1 """HTML Parser using html5
3 This file is a part of Bookmarks database and Internet robot.
# Module metadata.  The "$Revision$", "$Id$" and "$Date$" literals look like
# VCS keyword-substitution placeholders (presumably CVS/Subversion style --
# TODO confirm which tool expands them).  Each slice drops the leading
# "$Keyword: " prefix and the trailing " $" of an expanded keyword; applied
# to the unexpanded literals shown here, every slice yields an empty string.
6 __version__ = "$Revision$"[11:-2]
7 __revision__ = "$Id$"[5:-2]
8 __date__ = "$Date$"[7:-2]
9 __author__ = "Oleg Broytman <phd@phdru.name>"
10 __copyright__ = "Copyright (C) 2010, 2011 PhiloSoft Design"
11 __license__ = "GNU GPL"
# parse_html is the only name this module exports via "import *".
13 __all__ = ['parse_html']
16 from html5lib import HTMLParser as HTML5Parser
17 from .util import HTMLParser
# parse_html: parse an HTML document with html5lib's HTMLParser (imported
# here as HTML5Parser) and walk the resulting tree to collect the page
# title, the charset declared in a content-type <meta>, the refresh <meta>
# content and the icon <link> href.  The values are returned wrapped in the
# project's own HTMLParser object from .util.
#
# Parameters (as used by the visible lines):
#   filename -- not referenced on any visible line; presumably opened to
#               produce `fp` on a line missing from this view (TODO confirm).
#   charset  -- passed to the parser as the encoding, and preferred over the
#               <meta> charset when the title is encoded.
#   log      -- not referenced on any visible line.
#
# NOTE(review): this listing is incomplete -- the embedded line numbers jump
# (20, 21, 23, 25, ...), so the statements that bind `fp`, `html`, `head`,
# `meta_charset`, `refresh` and `icon`, and any break/else/guard lines, are
# not shown here.  Comments below describe only what the visible lines do.
20 def parse_html(filename, charset=None, log=None):
21 parser = HTML5Parser()
# `_parse` is a private html5lib method.  Meta parsing is switched on only
# when the caller supplied a charset (parseMeta is bool(charset)).
23 parser._parse(fp, encoding=charset, parseMeta=bool(charset))
# Fetch the parsed document from the parser's tree builder.
25 html_tree = parser.tree.getDocument()
# Locate the <html> element among the document's top-level nodes.
27 for node in html_tree.childNodes:
28 if (node.name == 'html') and (node.type != 3): # Skip DocType element
# Locate <head> among the children of `html` (bound on a line not shown).
37 for node in html.childNodes:
38 if node.name == 'head':
# Title: the value of the first child node of <title> inside <head>.
# NOTE(review): childNodes[0] raises IndexError for an empty <title> unless
# a guard exists on one of the missing lines -- confirm.
50 for node in head.childNodes:
51 if node.name == 'title':
53 title = node.childNodes[0].value
# Charset declared in the document: find <meta http-equiv="content-type">
# and take the text after "charset=" up to the next ";" from its lowercased
# content attribute.  The http-equiv value is compared exactly, so a value
# spelled "Content-Type" does not match this test.  The target that receives
# the extracted charset is on a line not shown.
61 for node in head.childNodes:
62 if node.name == 'meta' and \
63 ('http-equiv' in node.attributes) and \
64 (node.attributes['http-equiv'] == 'content-type'):
65 meta_content = node.attributes['content']
69 meta_content.lower().split('charset=')[1].split(';')[0]
# Take the encoding recorded by the parser's input stream as the charset;
# the condition guarding this assignment is not visible here.
76 charset = parser.tokenizer.stream.charEncoding[0]
# Encode the title using the charset, or the <meta> charset when the
# charset is falsy.
78 if charset or meta_charset:
79 title = title.encode(charset or meta_charset)
# Refresh: the content attribute of <meta http-equiv="refresh">.
81 for node in head.childNodes:
82 if node.name == 'meta' and \
83 ('http-equiv' in node.attributes) and \
84 (node.attributes['http-equiv'] == 'refresh'):
85 refresh = node.attributes['content']
# Icon: the href of a <link> whose rel is exactly "icon" or "shortcut icon".
88 for node in head.childNodes:
89 if node.name == 'link' and \
90 ('rel' in node.attributes) and \
91 (node.attributes['rel'] in ('icon', 'shortcut icon')):
92 icon = node.attributes['href']
# Look for a <title> that is a direct child of <html>; presumably the
# fallback used when no <head> was found (the branch header is not shown).
96 for node in html.childNodes:
97 if node.name == 'title':
99 title = node.childNodes[0].value
# Hand the collected values to the project's HTMLParser result object.
107 return HTMLParser(charset, meta_charset, title, refresh, icon)