From: Oleg Broytman <phd@phdru.name>
Date: Wed, 11 Aug 2010 20:07:29 +0000 (+0000)
Subject: Added HTML Parser based on html5 library.
X-Git-Tag: v4.5.3~114
X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=928e59567b0ba5e11efe915ae28d0e89f52bcc4a;p=bookmarks_db.git

Added HTML Parser based on html5 library.


git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@262 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23
---

diff --git a/Robots/parse_html.py b/Robots/parse_html.py
index b51bd31..f50b01b 100755
--- a/Robots/parse_html.py
+++ b/Robots/parse_html.py
@@ -12,6 +12,13 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
 
 parsers = []
 
+try:
+   import parse_html_html5  # optional parser; the module imports html5lib
+except ImportError:
+   pass  # parser module (or something it imports) is missing: skip it
+else:
+   parsers.append(parse_html_html5.parse_html)
+
 try:
    import parse_html_beautifulsoup
    parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
diff --git a/Robots/parse_html_html5.py b/Robots/parse_html_html5.py
new file mode 100644
index 0000000..511e640
--- /dev/null
+++ b/Robots/parse_html_html5.py
@@ -0,0 +1,66 @@
+"""
+    HTML Parser using html5lib.
+
+    Written by Broytman. Copyright (C) 2010 PhiloSoft Design
+"""
+
+from html5lib import HTMLParser as HTML5Parser
+from parse_html_util import HTMLParser
+
+
+def parse_html(filename, charset=None, log=None):
+    """Parse an HTML file with html5lib and collect data from its <head>.
+
+    Parameters:
+        filename -- path of the HTML file to read.
+        charset -- optional encoding; it is passed to the html5lib parser
+            and takes precedence over the <meta> charset for the title.
+        log -- accepted but not used by this parser.
+
+    Returns HTMLParser(charset, meta_charset, title, refresh, icon), each
+    taken from the first usable element: meta_charset from
+    <meta http-equiv="content-type"> (False if none), title from <title>
+    (encoded to charset or meta_charset when known), refresh from
+    <meta http-equiv="refresh"> and icon from an icon <link> (None if absent).
+    """
+    fp = open(filename)
+    try:
+        html_tree = HTML5Parser().parse(fp, charset)
+    finally:
+        fp.close()  # do not leak the file if the parser raises
+
+    # The last child of the document is taken to be the <html> element.
+    head_nodes = ()
+    for node in html_tree.childNodes[-1].childNodes:
+        if node.name == 'head':
+            head_nodes = node.childNodes
+            break
+
+    meta_charset = False
+    title = refresh = icon = None
+
+    # One pass over <head>; the first usable element wins for every field.
+    # Attribute values are matched case-insensitively (pages commonly write
+    # "Content-Type"), and a missing attribute or an empty <title> is
+    # skipped rather than raising KeyError/IndexError.
+    for node in head_nodes:
+        if node.name == 'title':
+            if title is None and node.childNodes:
+                title = node.childNodes[0].value
+        elif node.name == 'meta':
+            http_equiv = (node.attributes.get('http-equiv') or '').lower()
+            content = node.attributes.get('content')
+            if http_equiv == 'content-type' and content and not meta_charset:
+                parts = content.lower().split('charset=')
+                if len(parts) > 1:
+                    meta_charset = parts[1].split(';')[0].strip()
+            elif http_equiv == 'refresh' and refresh is None:
+                refresh = content
+        elif node.name == 'link' and icon is None:
+            rel = (node.attributes.get('rel') or '').lower()
+            if rel in ('icon', 'shortcut icon'):
+                icon = node.attributes.get('href')
+
+    if title and (charset or meta_charset):
+        title = title.encode(charset or meta_charset)
+    return HTMLParser(charset, meta_charset, title, refresh, icon)