From 349d6e0241d43e42a257d6019972402116236ee7 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Wed, 11 Aug 2010 18:22:25 +0000 Subject: [PATCH] Added HTML Parser based on TidyHTMLTreeBuilder. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@258 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html.py | 8 ++++++ Robots/parse_html_etreetidy.py | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 Robots/parse_html_etreetidy.py diff --git a/Robots/parse_html.py b/Robots/parse_html.py index 8ffca5c..cc7655b 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -11,6 +11,14 @@ universal_charset = "utf-8" DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] + +try: + import parse_html_etreetidy +except ImportError: + pass +else: + parsers.append(parse_html_etreetidy.parse_html) + try: import parse_html_beautifulsoup parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET diff --git a/Robots/parse_html_etreetidy.py b/Robots/parse_html_etreetidy.py new file mode 100644 index 0000000..65d42ae --- /dev/null +++ b/Robots/parse_html_etreetidy.py @@ -0,0 +1,52 @@ +""" + HTML Parser using ElementTree+TidyLib. + + Written by Broytman. 
# Copyright (C) 2010 PhiloSoft Design

from elementtidy import TidyHTMLTreeBuilder
from parse_html_util import HTMLParser


def _charset_from_meta(meta):
    """Return the charset named by a Content-Type <meta> element, or False.

    `meta` is the list of <meta> elements found under <head>.  Each element
    whose ``http-equiv`` attribute equals ``content-type`` (case-insensitive)
    is inspected; the first one whose ``content`` attribute carries a
    non-empty ``charset=`` value wins.  The value is returned lower-cased and
    stripped of surrounding whitespace.  False is returned when no element
    declares a charset.
    """
    for m in meta:
        if m.get('http-equiv', '').lower() != 'content-type':
            continue
        meta_content = m.get("content")
        if not meta_content:
            continue
        parts = meta_content.lower().split('charset=')
        if len(parts) < 2:
            # e.g. content="text/html": a content type without a charset.
            # Indexing [1] here used to raise IndexError.
            continue
        meta_charset = parts[1].split(';')[0].strip()
        if meta_charset:
            return meta_charset
    return False


def parse_html(filename, charset=None, log=None):
    """Parse an HTML file with TidyHTMLTreeBuilder and collect <head> data.

    Parameters:
        filename -- passed as-is to TidyHTMLTreeBuilder.parse().
        charset  -- optional encoding name; when given it takes precedence
                    over the charset declared in the document's <meta>.
        log      -- accepted for interface compatibility; not used here.

    Returns None if TidyHTMLTreeBuilder.parse() raises, so that the caller
    can try something else.  Otherwise returns
    HTMLParser(charset, meta_charset, title, refresh, icon) where:
        meta_charset -- charset from <meta http-equiv="Content-Type">,
                        or False when none is declared;
        title        -- text of head/title, encoded with
                        `charset or meta_charset` when one is known;
        refresh      -- ``content`` of the first
                        <meta http-equiv="Refresh">, or None;
        icon         -- ``href`` of the first <link> whose ``rel`` is
                        "icon" or "shortcut icon", or None.

    Errors raised by title.encode() (unknown charset name, characters that
    cannot be encoded) are not caught and propagate to the caller.
    """
    try:
        html_tree = TidyHTMLTreeBuilder.parse(filename)
    except Exception:
        # Best effort: any parser failure means "no result from this parser".
        # KeyboardInterrupt/SystemExit are deliberately no longer swallowed.
        return None

    XHTML = "{http://www.w3.org/1999/xhtml}"

    # Strip the XHTML namespace prefix from tags so that the plain paths
    # used below ('head/meta', 'head/title', 'head/link') match.
    for elem in html_tree.getiterator():
        if elem.tag.startswith(XHTML):
            elem.tag = elem.tag[len(XHTML):]

    meta = html_tree.findall('head/meta')
    meta_charset = _charset_from_meta(meta)

    title = html_tree.findtext('head/title')
    if title and (charset or meta_charset):
        title = title.encode(charset or meta_charset)

    refresh = None
    for m in meta:
        if m.get('http-equiv', '').lower() == 'refresh':
            refresh = m.get("content")
            break

    icon = None
    for link in html_tree.findall('head/link'):
        if link.get('rel', '').lower() in ('icon', 'shortcut icon'):
            icon = link.get("href")
            break

    return HTMLParser(charset, meta_charset, title, refresh, icon)