git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_robot_base.py
Feat(robots): Align "Content-Type"
[bookmarks_db.git] / Robots / bkmk_robot_base.py
index fb8bb2d6eab923b78f7e0c83c9236a78e1e2750d..d8877c6f41c6b6f30c8d8ba4b8dbcec396e6c02b 100644 (file)
@@ -29,10 +29,11 @@ from parse_html import parse_html
 
 
 reloc_dict = {
-  301: "perm.",
+  301: "perm1.",
   302: "temp2.",
   303: "temp3.",
   307: "temp7.",
+  308: "temp8.",
   "html": "html"
 }
 
@@ -111,15 +112,28 @@ class robot_base(Robot):
 
             md5 = md5wrapper()
             if url_type == "ftp":  # Pass welcome message through MD5
-                md5.update(self.get_ftp_welcome())
+                ftp_welcome = self.get_ftp_welcome()
+                if not isinstance(ftp_welcome, bytes):
+                    ftp_welcome = ftp_welcome.encode('utf-8')
+                md5.update(ftp_welcome)
 
-            md5.update(content)
+            if isinstance(content, bytes):
+                md5.update(content)
+            else:
+                md5.update(content.encode('utf-8'))
             bookmark.md5 = str(md5)
 
             if headers:
                 try:
                     content_type = headers["Content-Type"]
-                    self.log("   Content-Type: %s" % content_type)
+                    self.log("   Content-Type   : %s" % content_type)
+                    if content_type is None:
+                        if 'html' in content.lower():
+                            content_type = 'text/html'
+                        else:
+                            content_type = 'text/plain'
+                        self.log("   Set Content-Type to: %s"
+                                 % content_type)
                     try:
                         # extract charset from
                         # "text/html; foo; charset=UTF-8, bar; baz;"
@@ -130,13 +144,12 @@ class robot_base(Robot):
                     except (ValueError, IndexError):
                         charset = None
                         self.log("   no charset in Content-Type header")
+                    is_html = False
                     for ctype in ("text/html", "application/xhtml+xml"):
                         if content_type.startswith(ctype):
-                            html = True
+                            is_html = True
                             break
-                    else:
-                        html = False
-                    if html:
+                    if content and is_html:
                         parser = parse_html(content, charset, self.log)
                         if parser:
                             bookmark.real_title = parser.title
@@ -181,9 +194,11 @@ class robot_base(Robot):
                                 icons[icon_url] = None
                             else:
                                 content_type = icon_headers["Content-Type"]
-                                if content_type.startswith("application/") \
-                                   or content_type.startswith("image/") \
-                                   or content_type.startswith("text/plain"):
+                                if content_type and (
+                                    content_type.startswith("application/")
+                                    or content_type.startswith("image/")
+                                    or content_type.startswith("text/plain")
+                                ):
                                     bookmark.icon_href = icon_url
                                     self.log("   got icon       : %s"
                                              % content_type)
@@ -195,6 +210,8 @@ class robot_base(Robot):
                                         self.log("   non-image content type,"
                                                  " assume x-icon")
                                         content_type = 'image/x-icon'
+                                    if not isinstance(icon_data, bytes):
+                                        icon_data = icon_data.encode('utf-8')
                                     bookmark.icon = "data:%s;base64,%s" \
                                         % (content_type, b64encode(icon_data))
                                     icons[icon_url] = (content_type,
@@ -229,6 +246,10 @@ class robot_base(Robot):
                                                   % (url, timeout)
                                                   )
 
+                    if not content:
+                        self.log("   empty response, no content")
+                    if not is_html:
+                        self.log("   not html")
                 except KeyError as key:
                     self.log("   no header: %s" % key)
 
@@ -257,8 +278,15 @@ class robot_base(Robot):
         return 1
 
     def set_redirect(self, bookmark, errcode, newurl):
-        bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
-        self.log('   Moved: %s' % bookmark.moved)
+        bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
+        try:
+            moved.encode('ascii')
+        except UnicodeEncodeError:
+            try:
+                moved = moved.encode(bookmark.charset)
+            except (LookupError, TypeError, UnicodeEncodeError):
+                moved = moved.encode('utf-8')
+        self.log('   Moved: %s' % moved)
 
     def finish_check_url(self, bookmark):
         start = self.start