git.phdru.name Git - bookmarks_db.git/commitdiff
Fix(Py3): Decode content using HTTP charset
author Oleg Broytman <phd@phdru.name>
Mon, 20 Nov 2023 17:34:36 +0000 (20:34 +0300)
committer Oleg Broytman <phd@phdru.name>
Mon, 20 Nov 2023 19:15:20 +0000 (22:15 +0300)
Robots/bkmk_robot_base.py

index fd7237ae0184033797ef30a6a773fe9b200c10de..c5afd3fdc0aef90c1db8f69551e64c37e7c70882 100644 (file)
@@ -111,25 +111,13 @@ class robot_base(Robot):
             bookmark.size = size
             bookmark.last_modified = last_modified
 
-            md5 = md5wrapper()
-            if url_type == "ftp":  # Pass welcome message through MD5
-                ftp_welcome = self.get_ftp_welcome()
-                if not isinstance(ftp_welcome, bytes):
-                    ftp_welcome = ftp_welcome.encode('utf-8')
-                md5.update(ftp_welcome)
-
-            if isinstance(content, bytes):
-                md5.update(content)
-            else:
-                md5.update(content.encode('utf-8'))
-            bookmark.md5 = str(md5)
-
+            charset = None
             if headers:
                 try:
                     content_type = headers["Content-Type"]
                     self.log("   Content-Type   : %s" % content_type)
                     if content_type is None:
-                        if 'html' in content.lower():
+                        if b'html' in content.lower():
                             content_type = 'text/html'
                         else:
                             content_type = 'text/plain'
@@ -137,7 +125,7 @@ class robot_base(Robot):
                                  % content_type)
                     try:
                         # extract charset from
-                        # "text/html; foo; charset=UTF-8, bar; baz;"
+                        # "text/html; charset=UTF-8, foo; bar"
                         content_type, charset = content_type.split(';', 1)
                         content_type = content_type.strip()
                         charset = charset.split('=')[1].strip().split(',')[0]
@@ -151,6 +139,9 @@ class robot_base(Robot):
                             is_html = True
                             break
                     content_stripped = content.strip()
+                    if content_stripped and charset:
+                        content_stripped = content_stripped.decode(
+                            charset, 'replace')
                     if content_stripped and is_html:
                         parser = parse_html(
                             content_stripped, charset, self.log)
@@ -218,7 +209,7 @@ class robot_base(Robot):
                                                  " assume x-icon")
                                         content_type = 'image/x-icon'
                                     if not isinstance(icon_data, bytes):
-                                        icon_data = icon_data.encode('utf-8')
+                                        icon_data = icon_data.encode('latin1')
                                     bookmark.icon = "data:%s;base64,%s" \
                                         % (content_type, b64encode(icon_data))
                                     icons[icon_url] = (content_type,
@@ -252,6 +243,8 @@ class robot_base(Robot):
                                                   "%s (%s sec)"
                                                   % (url, timeout)
                                                   )
+                    elif charset:
+                        bookmark.charset = charset
 
                     if not content_stripped:
                         self.log("   empty response, no content")
@@ -260,6 +253,19 @@ class robot_base(Robot):
                 except KeyError as key:
                     self.log("   no header: %s" % key)
 
+            md5 = md5wrapper()
+            if url_type == "ftp":  # Pass welcome message through MD5
+                ftp_welcome = self.get_ftp_welcome()
+                if not isinstance(ftp_welcome, bytes):
+                    ftp_welcome = ftp_welcome.encode(charset or 'utf-8')
+                md5.update(ftp_welcome)
+
+            if isinstance(content, bytes):
+                md5.update(content)
+            else:
+                md5.update(content.encode(charset or 'utf-8'))
+            bookmark.md5 = str(md5)
+
         except EOFError:
             bookmark.error = "Unexpected EOF (FTP server closed connection)"
             self.log('   EOF: %s' % bookmark.error)