From efd4317f928f77d86f54162a04fa8272b71b355b Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 20 Nov 2023 20:34:36 +0300 Subject: [PATCH] Fix(Py3): Decode content using HTTP charset --- Robots/bkmk_robot_base.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index fd7237a..c5afd3f 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -111,25 +111,13 @@ class robot_base(Robot): bookmark.size = size bookmark.last_modified = last_modified - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - ftp_welcome = self.get_ftp_welcome() - if not isinstance(ftp_welcome, bytes): - ftp_welcome = ftp_welcome.encode('utf-8') - md5.update(ftp_welcome) - - if isinstance(content, bytes): - md5.update(content) - else: - md5.update(content.encode('utf-8')) - bookmark.md5 = str(md5) - + charset = None if headers: try: content_type = headers["Content-Type"] self.log(" Content-Type : %s" % content_type) if content_type is None: - if 'html' in content.lower(): + if b'html' in content.lower(): content_type = 'text/html' else: content_type = 'text/plain' @@ -137,7 +125,7 @@ class robot_base(Robot): % content_type) try: # extract charset from - # "text/html; foo; charset=UTF-8, bar; baz;" + # "text/html; charset=UTF-8, foo; bar" content_type, charset = content_type.split(';', 1) content_type = content_type.strip() charset = charset.split('=')[1].strip().split(',')[0] @@ -151,6 +139,9 @@ class robot_base(Robot): is_html = True break content_stripped = content.strip() + if content_stripped and charset: + content_stripped = content_stripped.decode( + charset, 'replace') if content_stripped and is_html: parser = parse_html( content_stripped, charset, self.log) @@ -218,7 +209,7 @@ class robot_base(Robot): " assume x-icon") content_type = 'image/x-icon' if not isinstance(icon_data, bytes): - icon_data = icon_data.encode('utf-8') + 
icon_data = icon_data.encode('latin1') bookmark.icon = "data:%s;base64,%s" \ % (content_type, b64encode(icon_data)) icons[icon_url] = (content_type, @@ -252,6 +243,8 @@ class robot_base(Robot): "%s (%s sec)" % (url, timeout) ) + elif charset: + bookmark.charset = charset if not content_stripped: self.log(" empty response, no content") @@ -260,6 +253,19 @@ class robot_base(Robot): except KeyError as key: self.log(" no header: %s" % key) + md5 = md5wrapper() + if url_type == "ftp": # Pass welcome message through MD5 + ftp_welcome = self.get_ftp_welcome() + if not isinstance(ftp_welcome, bytes): + ftp_welcome = ftp_welcome.encode(charset or 'utf-8') + md5.update(ftp_welcome) + + if isinstance(content, bytes): + md5.update(content) + else: + md5.update(content.encode(charset or 'utf-8')) + bookmark.md5 = str(md5) + except EOFError: bookmark.error = "Unexpected EOF (FTP server closed connection)" self.log(' EOF: %s' % bookmark.error) -- 2.39.2