Added an HTML parser based on the html5lib library.

author Oleg Broytman <phd@phdru.name>

Wed, 11 Aug 2010 20:07:29 +0000 (20:07 +0000)

committer Oleg Broytman <phd@phdru.name>

Wed, 11 Aug 2010 20:07:29 +0000 (20:07 +0000)
author Oleg Broytman <phd@phdru.name>
Wed, 11 Aug 2010 20:07:29 +0000 (20:07 +0000)
committer Oleg Broytman <phd@phdru.name>
Wed, 11 Aug 2010 20:07:29 +0000 (20:07 +0000)
diff --git a/Robots/parse_html.py b/Robots/parse_html.py

index b51bd317488124d82dede34fd23c51e87d43756f..f50b01b415b5c6fd775fa41cb20325127ebf0469 100755 (executable)
--- a/Robots/parse_html.py
+++ b/Robots/parse_html.py
@@ -12,6 +12,13 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
  
  parsers = []
  
+try:
+   import parse_html_html5
+except ImportError:
+   pass
+else:
+   parsers.append(parse_html_html5.parse_html)
+
  try:
     import parse_html_beautifulsoup
     parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
diff --git a/Robots/parse_html_html5.py b/Robots/parse_html_html5.py

new file mode 100644 (file)

index 0000000..511e640
--- /dev/null
+++ b/Robots/parse_html_html5.py
@@ -0,0 +1,66 @@
+"""
+    HTML Parser using html5.
+
+    Written by Broytman. Copyright (C) 2010 PhiloSoft Design
+"""
+
+from html5lib import HTMLParser as HTML5Parser
+from parse_html_util import HTMLParser
+
+
+def parse_html(filename, charset=None, log=None):
+    fp = open(filename)
+    html_tree = HTML5Parser().parse(fp, charset)
+    fp.close()
+
+    html = html_tree.childNodes[-1]
+    for node in html.childNodes:
+        if node.name == 'head':
+            head = node
+            break
+    else:
+        head = None
+
+    meta_charset = False
+    title = None
+    refresh = None
+    icon = None
+
+    if head:
+        for node in head.childNodes:
+            if node.name == 'meta' and \
+                    ('http-equiv' in node.attributes) and \
+                    (node.attributes['http-equiv'] == 'content-type'):
+                meta_content = node.attributes['content']
+                if meta_content:
+                    try:
+                        meta_charset = \
+                            meta_content.lower().split('charset=')[1].split(';')[0]
+                    except IndexError:
+                        meta_charset = False
+                    else:
+                        break
+
+        for node in head.childNodes:
+            if node.name == 'title':
+                title = node.childNodes[0].value
+                break
+
+        if title and (charset or meta_charset):
+            title = title.encode(charset or meta_charset)
+
+        for node in head.childNodes:
+            if node.name == 'meta' and \
+                    ('http-equiv' in node.attributes) and \
+                    (node.attributes['http-equiv'] == 'refresh'):
+                refresh = node.attributes['content']
+                break
+
+        for node in head.childNodes:
+            if node.name == 'link' and \
+                    ('rel' in node.attributes) and \
+                    (node.attributes['rel'] in ('icon', 'shortcut icon')):
+                icon = node.attributes['href']
+                break
+
+    return HTMLParser(charset, meta_charset, title, refresh, icon)
author	Oleg Broytman <phd@phdru.name>
	Wed, 11 Aug 2010 20:07:29 +0000 (20:07 +0000)
committer	Oleg Broytman <phd@phdru.name>
	Wed, 11 Aug 2010 20:07:29 +0000 (20:07 +0000)
Robots/parse_html.py		patch \| blob \| history
Robots/parse_html_html5.py	[new file with mode: 0644]	patch \| blob