git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_robot_base.py
Fix(robots): Do not parse empty strings
[bookmarks_db.git] / Robots / bkmk_robot_base.py
index 85c704a64955608c5add79747020d2b48ea74889..52d6b563f1056c9ffe29a08299d59084d248be4e 100644 (file)
@@ -29,10 +29,11 @@ from parse_html import parse_html
 
 
 reloc_dict = {
-  301: "perm.",
+  301: "perm1.",
   302: "temp2.",
   303: "temp3.",
   307: "temp7.",
+  308: "temp8.",
   "html": "html"
 }
 
@@ -125,7 +126,7 @@ class robot_base(Robot):
             if headers:
                 try:
                     content_type = headers["Content-Type"]
-                    self.log("   Content-Type: %s" % content_type)
+                    self.log("   Content-Type   : %s" % content_type)
                     if content_type is None:
                         if 'html' in content.lower():
                             content_type = 'text/html'
@@ -143,14 +144,14 @@ class robot_base(Robot):
                     except (ValueError, IndexError):
                         charset = None
                         self.log("   no charset in Content-Type header")
+                    is_html = False
                     for ctype in ("text/html", "application/xhtml+xml"):
                         if content_type.startswith(ctype):
-                            html = True
+                            is_html = True
                             break
-                    else:
-                        html = False
-                    if html:
-                        parser = parse_html(content, charset, self.log)
+                    content_stripped = content.strip()
+                    if content_stripped and is_html:
+                        parser = parse_html(content_stripped, charset, self.log)
                         if parser:
                             bookmark.real_title = parser.title
                             icon = parser.icon
@@ -246,6 +247,10 @@ class robot_base(Robot):
                                                   % (url, timeout)
                                                   )
 
+                    if not content_stripped:
+                        self.log("   empty response, no content")
+                    if not is_html:
+                        self.log("   not html")
                 except KeyError as key:
                     self.log("   no header: %s" % key)