git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_robot_base.py
Fix(Robots/bkmk_robot_base): Ignore unknown charset
[bookmarks_db.git] / Robots / bkmk_robot_base.py
index c5afd3fdc0aef90c1db8f69551e64c37e7c70882..1e511d0b48a625e752f337431892ff7872c050bf 100644 (file)
@@ -5,21 +5,17 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_base', 'get_error']
 
 
 from base64 import b64encode
+from urllib.parse import urlsplit, urljoin
 import sys
 import socket
 import time
-try:
-    from urllib.parse import splittype, splithost, splittag, urljoin
-except ImportError:
-    from urllib import splittype, splithost, splittag
-    from urlparse import urljoin
 
 from m_lib.md5wrapper import md5wrapper
 from m_lib.net.www.util import parse_time
@@ -66,10 +62,9 @@ class robot_base(Robot):
             self.start = int(time.time())
             bookmark.icon = None
 
-            url_type, url_rest = splittype(bookmark.href)
-            url_host, url_path = splithost(url_rest)
-            url_path, url_tag  = splittag(url_path)  # noqa: E221
-            #                    multiple spaces before operator
+            split_results = urlsplit(bookmark.href)
+            url_type, netloc, url_path, query, url_tag = split_results
+            url_host = split_results.hostname
 
             url = "%s://%s%s" % (url_type, url_host, url_path)
             error, redirect_code, redirect_to, headers, content = \
@@ -140,8 +135,13 @@ class robot_base(Robot):
                             break
                     content_stripped = content.strip()
                     if content_stripped and charset:
-                        content_stripped = content_stripped.decode(
-                            charset, 'replace')
+                        try:
+                            content_stripped = content_stripped.decode(
+                                charset, 'replace')
+                        except LookupError:
+                            charset = None
+                            self.log("   unknown charset "
+                                     "in Content-Type header")
                     if content_stripped and is_html:
                         parser = parse_html(
                             content_stripped, charset, self.log)
@@ -216,7 +216,7 @@ class robot_base(Robot):
                                                        bookmark.icon
                                                        )
                                 else:
-                                    self.log("   no icon        :"
+                                    self.log("   no icon        : "
                                              "bad content type '%s'"
                                              % content_type
                                              )