git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_robot_base.py
Fix(robots): Process response without `Content-Type`
[bookmarks_db.git] / Robots / bkmk_robot_base.py
index 395d4f8d6981a3a17e50bbe11986e8c30a325ac2..b5cac691fe9f2842d12184f30de6c6b2271dbb83 100644 (file)
@@ -15,8 +15,11 @@ from base64 import b64encode
 import sys
 import socket
 import time
-import urllib
-from urlparse import urljoin
+try:
+    from urllib.parse import splittype, splithost, splittag, urljoin
+except ImportError:
+    from urllib import splittype, splithost, splittag
+    from urlparse import urljoin
 
 from m_lib.md5wrapper import md5wrapper
 from m_lib.net.www.util import parse_time
@@ -62,13 +65,14 @@ class robot_base(Robot):
             self.start = int(time.time())
             bookmark.icon = None
 
-            url_type, url_rest = urllib.splittype(bookmark.href)
-            url_host, url_path = urllib.splithost(url_rest)
-            url_path, url_tag  = urllib.splittag(url_path)  # noqa: E221
-            #                            multiple spaces before operator
+            url_type, url_rest = splittype(bookmark.href)
+            url_host, url_path = splithost(url_rest)
+            url_path, url_tag  = splittag(url_path)  # noqa: E221
+            #                    multiple spaces before operator
 
             url = "%s://%s%s" % (url_type, url_host, url_path)
-            error, redirect_code, redirect_to, headers, content = self.get(bookmark, url, True)
+            error, redirect_code, redirect_to, headers, content = \
+                self.get(bookmark, url, True)
 
             if error:
                 bookmark.error = error
@@ -106,18 +110,32 @@ class robot_base(Robot):
             bookmark.last_modified = last_modified
 
             md5 = md5wrapper()
-            if url_type == "ftp": # Pass welcome message through MD5
-                md5.update(self.get_ftp_welcome())
-
-            md5.update(content)
+            if url_type == "ftp":  # Pass welcome message through MD5
+                ftp_welcome = self.get_ftp_welcome()
+                if not isinstance(ftp_welcome, bytes):
+                    ftp_welcome = ftp_welcome.encode('utf-8')
+                md5.update(ftp_welcome)
+
+            if isinstance(content, bytes):
+                md5.update(content)
+            else:
+                md5.update(content.encode('utf-8'))
             bookmark.md5 = str(md5)
 
             if headers:
                 try:
                     content_type = headers["Content-Type"]
                     self.log("   Content-Type: %s" % content_type)
+                    if content_type is None:
+                        if 'html' in content.lower():
+                            content_type = 'text/html'
+                        else:
+                            content_type = 'text/plain'
+                        self.log("   Set Content-Type to: %s"
+                                 % content_type)
                     try:
-                        # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
+                        # extract charset from
+                        # "text/html; foo; charset=UTF-8, bar; baz;"
                         content_type, charset = content_type.split(';', 1)
                         content_type = content_type.strip()
                         charset = charset.split('=')[1].strip().split(',')[0]
@@ -140,7 +158,8 @@ class robot_base(Robot):
                             icon = None
                         if not icon:
                             icon = "/favicon.ico"
-                        icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
+                        icon_url = urljoin(
+                            "%s://%s%s" % (url_type, url_host, url_path), icon)
                         self.log("   looking for icon at: %s" % icon_url)
                         if icon_url in icons:
                             if icons[icon_url]:
@@ -153,12 +172,14 @@ class robot_base(Robot):
                             try:
                                 _icon_url = icon_url
                                 for i in range(8):
-                                    error, icon_redirect_code, icon_redirect_to, \
-                                        icon_headers, icon_data = \
+                                    error, icon_redirect_code, \
+                                        icon_redirect_to, icon_headers, \
+                                        icon_data = \
                                         self.get(bookmark, _icon_url)
                                     if icon_redirect_code:
                                         _icon_url = icon_redirect_to
-                                        self.log("   redirect to : %s" % _icon_url)
+                                        self.log("   redirect to : %s"
+                                                 % _icon_url)
                                     else:
                                         if icon_data is None:
                                             raise IOError("No icon")
@@ -166,25 +187,41 @@ class robot_base(Robot):
                                 else:
                                     raise IOError("Too many redirects")
                             except:
-                                etype, emsg, tb = sys.exc_info()
-                                self.log("   no icon        : %s %s" % (etype, emsg))
-                                etype = emsg = tb = None
+                                etype, emsg, _ = sys.exc_info()
+                                self.log("   no icon        : %s %s"
+                                         % (etype, emsg))
+                                etype = emsg = _ = None
                                 icons[icon_url] = None
                             else:
                                 content_type = icon_headers["Content-Type"]
-                                if content_type.startswith("application/") \
-                                   or content_type.startswith("image/") \
-                                   or content_type.startswith("text/plain"):
+                                if content_type and (
+                                    content_type.startswith("application/")
+                                    or content_type.startswith("image/")
+                                    or content_type.startswith("text/plain")
+                                ):
                                     bookmark.icon_href = icon_url
-                                    self.log("   got icon       : %s" % content_type)
-                                    if content_type.startswith("application/") \
-                                       or content_type.startswith("text/plain"):
-                                        self.log("   non-image content type, assume x-icon")
+                                    self.log("   got icon       : %s"
+                                             % content_type)
+                                    if (
+                                        content_type.startswith("application/")
+                                        or content_type.startswith(
+                                            "text/plain")
+                                    ):
+                                        self.log("   non-image content type,"
+                                                 " assume x-icon")
                                         content_type = 'image/x-icon'
-                                    bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
-                                    icons[icon_url] = (content_type, bookmark.icon)
+                                    if not isinstance(icon_data, bytes):
+                                        icon_data = icon_data.encode('utf-8')
+                                    bookmark.icon = "data:%s;base64,%s" \
+                                        % (content_type, b64encode(icon_data))
+                                    icons[icon_url] = (content_type,
+                                                       bookmark.icon
+                                                       )
                                 else:
-                                    self.log("   no icon        : bad content type '%s'" % content_type)
+                                    self.log("   no icon        :"
+                                             "bad content type '%s'"
+                                             % content_type
+                                             )
                                     icons[icon_url] = None
                         if parser and parser.refresh:
                             refresh = parser.refresh
@@ -195,13 +232,19 @@ class robot_base(Robot):
                             try:
                                 timeout = float(refresh.split(';')[0])
                             except (IndexError, ValueError):
-                                self.set_redirect(bookmark, "html", "Bad redirect to %s (%s)" % (url, refresh))
+                                self.set_redirect(bookmark, "html",
+                                                  "Bad redirect to %s (%s)"
+                                                  % (url, refresh)
+                                                  )
                             else:
                                 try:
                                     timeout = int(refresh.split(';')[0])
                                 except ValueError:
-                                    pass # float timeout
-                                self.set_redirect(bookmark, "html", "%s (%s sec)" % (url, timeout))
+                                    pass  # float timeout
+                                self.set_redirect(bookmark, "html",
+                                                  "%s (%s sec)"
+                                                  % (url, timeout)
+                                                  )
 
                 except KeyError as key:
                     self.log("   no header: %s" % key)