Fix(Robots/bkmk_robot_base): Add forgotten spaces in log
diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py
index c5afd3fdc0aef90c1db8f69551e64c37e7c70882..c71ce2207f366b46a2d35c07c3a73f4ae206f6b9 100644
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -5,21 +5,17 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_base', 'get_error']
 
 
 from base64 import b64encode
+from urllib.parse import urlsplit, urljoin
 import sys
 import socket
 import time
-try:
-    from urllib.parse import splittype, splithost, splittag, urljoin
-except ImportError:
-    from urllib import splittype, splithost, splittag
-    from urlparse import urljoin
 
 from m_lib.md5wrapper import md5wrapper
 from m_lib.net.www.util import parse_time
@@ -28,6 +24,22 @@ from bkmk_objects import Robot
 from parse_html import parse_html
 
 
+# Fake headers to pretend this is a real browser
+_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
+               " Gecko/20001221 Firefox/2.0.0")
+_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
+
+request_headers = {
+    'Accept': '*/*',
+    'Accept-Language': 'ru,en',
+    'Cache-Control': 'max-age=300',
+    'Connection': 'close',
+    'Referer': '/',
+    'User-Agent': _user_agent,
+    'X-User-Agent': _x_user_agent,
+}
+
+
 reloc_dict = {
   301: "perm1.",
   302: "temp2.",
@@ -66,10 +78,9 @@ class robot_base(Robot):
             self.start = int(time.time())
             bookmark.icon = None
 
-            url_type, url_rest = splittype(bookmark.href)
-            url_host, url_path = splithost(url_rest)
-            url_path, url_tag  = splittag(url_path)  # noqa: E221
-            #                    multiple spaces before operator
+            split_results = urlsplit(bookmark.href)
+            url_type, netloc, url_path, query, url_tag = split_results
+            url_host = split_results.hostname
 
             url = "%s://%s%s" % (url_type, url_host, url_path)
             error, redirect_code, redirect_to, headers, content = \
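
For reference, the new urlsplit() call maps onto the removed splittype/splithost/splittag helpers like this (illustrative snippet, not part of the patch):

from urllib.parse import urlsplit

parts = urlsplit("https://example.org:8080/docs/index.html?q=1#top")
# parts.scheme   -> 'https'            (old url_type)
# parts.netloc   -> 'example.org:8080'
# parts.path     -> '/docs/index.html' (old url_path)
# parts.query    -> 'q=1'
# parts.fragment -> 'top'              (old url_tag)
# parts.hostname -> 'example.org'      (old url_host, lower-cased, without the port)
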
@@ -140,8 +151,13 @@ class robot_base(Robot):
                             break
                     content_stripped = content.strip()
                     if content_stripped and charset:
-                        content_stripped = content_stripped.decode(
-                            charset, 'replace')
+                        try:
+                            content_stripped = content_stripped.decode(
+                                charset, 'replace')
+                        except LookupError:
+                            charset = None
+                            self.log("   unknown charset "
+                                     "in Content-Type header")
                     if content_stripped and is_html:
                         parser = parse_html(
                             content_stripped, charset, self.log)
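
The new except clause covers the case where the Content-Type header names a codec Python does not know: bytes.decode() raises LookupError for an unknown codec regardless of the error handler. A small illustration:

try:
    b"<html></html>".decode("no-such-charset", "replace")
except LookupError as exc:
    print("unknown charset:", exc)
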
@@ -163,9 +179,16 @@ class robot_base(Robot):
                             if icons[icon_url]:
                                 bookmark.icon_href = icon_url
                                 content_type, bookmark.icon = icons[icon_url]
-                                self.log("   cached icon: %s" % content_type)
+                                self.log("   cached icon    : %s"
+                                         % content_type)
                             else:
-                                self.log("   cached icon: no icon")
+                                self.log("   cached icon    : no icon")
+                        elif icon_url.startswith('data:'):
+                            content_type, icon_data = \
+                                icon_url[len('data:'):].split(',', 1)
+                            bookmark.icon_href = bookmark.icon = icon_url
+                            self.log("   got data icon  : %s" % content_type)
+                            icons[icon_url] = (content_type, icon_url)
                         else:
                             try:
                                 _icon_url = icon_url
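
The new data: branch above splits an inline favicon URI at the first comma, keeping everything after the "data:" prefix and before the comma as the content type. An illustrative example of that split:

icon_url = "data:image/png;base64,iVBORw0KGgo="
content_type, icon_data = icon_url[len("data:"):].split(",", 1)
# content_type -> 'image/png;base64'
# icon_data    -> 'iVBORw0KGgo='
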
@@ -216,7 +239,7 @@ class robot_base(Robot):
                                                        bookmark.icon
                                                        )
                                 else:
-                                    self.log("   no icon        :"
+                                    self.log("   no icon        : "
                                              "bad content type '%s'"
                                              % content_type
                                              )