Added HTML Parser based on lxml.

author Oleg Broytman <phd@phdru.name>

Fri, 13 Aug 2010 13:17:33 +0000 (13:17 +0000)

committer Oleg Broytman <phd@phdru.name>

Fri, 13 Aug 2010 13:17:33 +0000 (13:17 +0000)
author Oleg Broytman <phd@phdru.name>
Fri, 13 Aug 2010 13:17:33 +0000 (13:17 +0000)
committer Oleg Broytman <phd@phdru.name>
Fri, 13 Aug 2010 13:17:33 +0000 (13:17 +0000)
diff --git a/Robots/parse_html_lxml.py b/Robots/parse_html_lxml.py

new file mode 100644 (file)

index 0000000..7aba09c
--- /dev/null
+++ b/Robots/parse_html_lxml.py
@@ -0,0 +1,51 @@
+"""
+    HTML Parser using lxml.html.
+
+    Written by Broytman. Copyright (C) 2010 PhiloSoft Design
+"""
+
+from lxml.html import parse
+from parse_html_util import HTMLParser
+
+
+def parse_html(filename, charset=None, log=None):
+    html_tree = parse(filename)
+
+    title = html_tree.findtext('head/title')
+    if title is None:
+        title = html_tree.findtext('title')
+        if title is None:
+            return None
+
+    meta = html_tree.findall('head/meta')
+    for m in meta:
+        if m.get('http-equiv', '').lower() == 'content-type':
+            meta_content = m.get("content")
+            if meta_content:
+                try:
+                    meta_charset = \
+                        meta_content.lower().split('charset=')[1].split(';')[0]
+                    break
+                except IndexError:
+                    meta_charset = False
+    else:
+        meta_charset = False
+
+    if charset or meta_charset:
+        title = title.encode(charset or meta_charset)
+
+    for m in meta:
+        if m.get('http-equiv', '').lower() == 'refresh':
+            refresh = m.get("content")
+            break
+    else:
+        refresh = None
+
+    for link in html_tree.findall('head/link'):
+        if link.get('rel', '').lower() in ('icon', 'shortcut icon'):
+            icon = link.get("href")
+            break
+    else:
+        icon = None
+
+    return HTMLParser(charset, meta_charset, title, refresh, icon)
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE

index 79d7b1bbd3abfc7eb2d1c549d7b6c8701884bbcd..132ccbc6fe145f3e8f807e38d5974ca00688eddd 100644 (file)
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -5,6 +5,10 @@ WHAT IS IT
     A set of classes, libraries, programs and plugins I use to manipulate my
  bookmarks.html.
  
+WHAT'S NEW in version 4.2.2.
+
+    Added HTML Parser based on lxml.
+
  WHAT'S NEW in version 4.2.1 (2010-08-12).
  
     Added HTML Parser based on html5 library.
author	Oleg Broytman <phd@phdru.name>
	Fri, 13 Aug 2010 13:17:33 +0000 (13:17 +0000)
committer	Oleg Broytman <phd@phdru.name>
	Fri, 13 Aug 2010 13:17:33 +0000 (13:17 +0000)
Robots/parse_html_lxml.py	[new file with mode: 0644]	patch | blob
doc/ANNOUNCE		patch | blob | history