"""
   HTML Parser using lxml.html.

   Written by Broytman. Copyright (C) 2010 PhiloSoft Design
"""

from lxml.html import parse
from parse_html_util import HTMLParser


def _find_meta_charset(meta):
    """Return the charset declared by a Content-Type <meta>, or False.

    `meta` is the list of <meta> elements to scan.  The first element whose
    `http-equiv` is "content-type" and whose `content` contains a
    "charset=" part wins.  The value is lower-cased and stripped of
    surrounding whitespace and quotes, so that "charset= utf-8" or
    "charset='utf-8'" yield a name that can be used as a codec name.
    """
    for m in meta:
        if m.get('http-equiv', '').lower() != 'content-type':
            continue
        meta_content = m.get("content")
        if not meta_content:
            continue
        try:
            raw = meta_content.lower().split('charset=')[1].split(';')[0]
        except IndexError:
            # Content-Type without a charset part; keep looking.
            continue
        return raw.strip(' \t\r\n"\'')
    return False


def _find_refresh(meta):
    """Return the `content` of the first Refresh <meta>, or None."""
    for m in meta:
        if m.get('http-equiv', '').lower() == 'refresh':
            return m.get("content")
    return None


def _find_icon(links):
    """Return the `href` of the first icon <link>, or None."""
    for link in links:
        if link.get('rel', '').lower() in ('icon', 'shortcut icon'):
            return link.get("href")
    return None


def parse_html(filename, charset=None, log=None):
    """Parse an HTML file and collect title, charset, refresh and icon.

    Parameters:
        filename -- passed unchanged to lxml.html.parse().
        charset  -- optional charset name; when given it takes precedence
                    over the charset declared in the document's <meta>.
        log      -- accepted for interface compatibility; not used here.

    Returns None when the document has no title (neither `head/title` nor
    `title` under the root).  Otherwise returns
    HTMLParser(charset, meta_charset, title, refresh, icon) where

        meta_charset -- charset from <meta http-equiv="Content-Type">,
                        or False when none is declared;
        title        -- the title text, encoded with `charset` or
                        `meta_charset` when either is set;
        refresh      -- `content` of <meta http-equiv="Refresh"> or None;
        icon         -- `href` of <link rel="icon"> / "shortcut icon"
                        or None.

    NOTE(review): title.encode() is not guarded, so an unknown charset
    name (LookupError) or an unencodable title still propagates to the
    caller, as before.
    """
    html_tree = parse(filename)

    title = html_tree.findtext('head/title')
    if title is None:
        title = html_tree.findtext('title')
    if title is None:
        return None

    meta = html_tree.findall('head/meta')
    meta_charset = _find_meta_charset(meta)

    if charset or meta_charset:
        title = title.encode(charset or meta_charset)

    refresh = _find_refresh(meta)
    icon = _find_icon(html_tree.findall('head/link'))

    return HTMLParser(charset, meta_charset, title, refresh, icon)