git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/parse_html_html5.py
Moved lxml-based parser after BeautifulSoup - it doesn't accept charset.
[bookmarks_db.git] / Robots / parse_html_html5.py
index fcc7e158a20e2e0a6f35aa8840cbb197dd1c8755..43e8d74ba957f658ffef276cadb2782ac27e84e5 100644 (file)
@@ -9,14 +9,22 @@ from parse_html_util import HTMLParser
 
 
 def parse_html(filename, charset=None, log=None):
+    parser = HTML5Parser()
     fp = open(filename)
-    html_tree = HTML5Parser().parse(fp, charset)
+    parser._parse(fp, encoding=charset, parseMeta=bool(charset))
     fp.close()
+    html_tree = parser.tree.getDocument()
 
-    if not html_tree.childNodes:
+    for node in html_tree.childNodes:
+        if (node.name == 'html') and (node.type != 3): # Skip DocType element
+            html = node
+            break
+    else:
+        html = None
+
+    if not html:
         return None
 
-    html = html_tree.childNodes[-1]
     for node in html.childNodes:
         if node.name == 'head':
             head = node
@@ -30,6 +38,17 @@ def parse_html(filename, charset=None, log=None):
     icon = None
 
     if head:
+        for node in head.childNodes:
+            if node.name == 'title':
+                if node.childNodes:
+                    title = node.childNodes[0].value
+                    break
+                else:
+                    title = ''
+
+        if title is None:
+            return None
+
         for node in head.childNodes:
             if node.name == 'meta' and \
                     ('http-equiv' in node.attributes) and \
@@ -44,15 +63,10 @@ def parse_html(filename, charset=None, log=None):
                     else:
                         break
 
-        for node in head.childNodes:
-            if node.name == 'title':
-                if node.childNodes:
-                    title = node.childNodes[0].value
-                    break
-                else:
-                    title = ''
+        if not charset:
+            charset = parser.tokenizer.stream.charEncoding[0]
 
-        if title and (charset or meta_charset):
+        if charset or meta_charset:
             title = title.encode(charset or meta_charset)
 
         for node in head.childNodes:
@@ -69,4 +83,16 @@ def parse_html(filename, charset=None, log=None):
                 icon = node.attributes['href']
                 break
 
+    else:
+        for node in html.childNodes:
+            if node.name == 'title':
+                if node.childNodes:
+                    title = node.childNodes[0].value
+                    break
+                else:
+                    title = ''
+
+        if title is None:
+            return None
+
     return HTMLParser(charset, meta_charset, title, refresh, icon)