# Provenance (recovered from the commit this module was taken from):
#   From: Oleg Broytman
#   Date: Fri, 13 Aug 2010 13:17:33 +0000 (+0000)
#   Subject: Added HTML Parser based on lxml.
#   X-Git-Tag: v4.5.3~98
#   X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=commitdiff_plain;h=b747da40daf40c8b32a437758d7b4244d752838a
#   git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@278 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23
#
# The commit adds this module as Robots/parse_html_lxml.py and inserts the
# following entry into doc/ANNOUNCE, just above the 4.2.1 (2010-08-12) entry
# "Added HTML Parser based on html5 library.":
#
#   WHAT'S NEW in version 4.2.2.
#
#      Added HTML Parser based on lxml.

"""
   HTML Parser using lxml.html.

   Written by Broytman. Copyright (C) 2010 PhiloSoft Design
"""

from lxml.html import parse
from parse_html_util import HTMLParser


def _charset_from_content_type(content):
    """Return the ``charset=`` value of a Content-Type string, or None.

    The value is lower-cased and cut at the first ``;``.  Surrounding
    whitespace and quotes are stripped, so ``charset="UTF-8"`` yields
    ``utf-8`` -- a name ``encode()`` accepts -- rather than ``"utf-8"``.
    None means the string carries no ``charset=`` parameter at all; an
    empty string means the parameter is present but has no value.
    """
    pieces = content.lower().split('charset=')
    if len(pieces) < 2:
        return None
    return pieces[1].split(';')[0].strip(' \t\r\n\'"')


def parse_html(filename, charset=None, log=None):
    """Extract title, declared charset, refresh and icon from an HTML file.

    filename -- passed straight to lxml.html.parse().
    charset  -- optional encoding name; when given it takes precedence over
                the charset declared in the document when encoding the title.
    log      -- accepted for interface compatibility; not used here.

    Returns None when the document has no <title> (looked up as head/title,
    then as title).  Otherwise returns
    HTMLParser(charset, meta_charset, title, refresh, icon) where

    * meta_charset is the charset declared by the first
      <meta http-equiv="Content-Type"> in <head> whose content has a
      ``charset=`` parameter, or False when there is none;
    * title is the title text, encoded with ``charset or meta_charset``
      when either is set (an unknown codec name propagates from encode());
    * refresh is the content of the first <meta http-equiv="Refresh">
      in <head>, or None;
    * icon is the href of the first head <link> whose rel is "icon" or
      "shortcut icon", or None.
    """
    html_tree = parse(filename)

    title = html_tree.findtext('head/title')
    if title is None:
        title = html_tree.findtext('title')
    if title is None:
        return None

    meta = html_tree.findall('head/meta')

    # False (not None) is the "no charset declared" value callers receive.
    meta_charset = False
    for m in meta:
        if m.get('http-equiv', '').lower() != 'content-type':
            continue
        meta_content = m.get("content")
        if not meta_content:
            continue
        declared = _charset_from_content_type(meta_content)
        if declared is not None:
            # First Content-Type meta that names a charset wins.
            meta_charset = declared
            break

    if charset or meta_charset:
        title = title.encode(charset or meta_charset)

    refresh = None
    for m in meta:
        if m.get('http-equiv', '').lower() == 'refresh':
            # The first refresh meta decides, even if it has no content.
            refresh = m.get("content")
            break

    icon = None
    for link in html_tree.findall('head/link'):
        if link.get('rel', '').lower() in ('icon', 'shortcut icon'):
            icon = link.get("href")
            break

    return HTMLParser(charset, meta_charset, title, refresh, icon)