2 HTML Parser using lxml.html.
4 Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design
7 from lxml.html import parse
8 from .util import HTMLParser
# parse_html: parse the HTML document at `filename` with lxml.html.parse and
# collect the page title, the charset declared in a <meta http-equiv> tag,
# the meta-refresh content and the favicon link, returning them wrapped in
# an HTMLParser object (imported from .util).
#
# Parameters:
#   filename -- passed straight to lxml.html.parse().
#   charset  -- optional charset; when truthy it takes precedence over the
#               charset found in the document's <meta> tag.
#   log      -- not referenced in any of the lines shown here.
#
# NOTE(review): the embedded line numbers skip (12 -> 14, 19 -> 23, 30 -> 37,
# ...), so this listing omits some lines of the function. Loop headers,
# branch bodies and any fallback assignments are not visible; the comments
# below describe only what the visible lines do.
11 def parse_html(filename, charset=None, log=None):
12 html_tree = parse(filename)
# Guard for a document that produced no root element; the body of this
# branch is on a line that is not shown.
14 if html_tree.getroot() is None:
# The title is looked up at 'head/title' and then at 'title'; presumably the
# second lookup is a fallback guarded by an omitted condition -- TODO confirm.
17 title = html_tree.findtext('head/title')
19 title = html_tree.findtext('title')
# All <meta> elements directly under <head>; reused for the refresh lookup.
23 meta = html_tree.findall('head/meta')
# `m` is presumably the loop variable of an omitted `for m in meta:` line.
# The http-equiv attribute is compared case-insensitively.
25 if m.get('http-equiv', '').lower() == 'content-type':
26 meta_content = m.get("content")
# Lower-cases the content value and takes the text after the first 'charset='
# up to the next ';'. The [1] index raises IndexError when 'charset=' is
# absent; any handling of that, and the assignment target (presumably
# meta_charset), sit on lines that are not shown.
# NOTE(review): surrounding quotes or whitespace are not stripped here.
30 meta_content.lower().split('charset=')[1].split(';')[0]
# Encode the title using the explicit charset if given, else the meta charset.
# NOTE(review): no handling of an unknown codec name (LookupError) or of
# unencodable characters is visible in this listing -- verify in the full file.
37 if charset or meta_charset:
38 title = title.encode(charset or meta_charset)
# Same <meta> scan as above, this time for http-equiv="refresh".
41 if m.get('http-equiv', '').lower() == 'refresh':
42 refresh = m.get("content")
# Favicon: a <link> directly under <head> whose lower-cased rel attribute is
# exactly 'icon' or 'shortcut icon'; its href is kept.
47 for link in html_tree.findall('head/link'):
48 if link.get('rel', '').lower() in ('icon', 'shortcut icon'):
49 icon = link.get("href")
# Arguments are passed positionally; the HTMLParser constructor signature
# lives in .util and is not visible here.
54 return HTMLParser(charset, meta_charset, title, refresh, icon)