X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=a03d5c1b16ec430ab17ac8f899655ae9cf9808bb;hb=HEAD;hp=2024ab85efd74594eb4700078f19f7e012ae2b78;hpb=39125836fc96da8bd411da9b03bdc14b48eb4e9d;p=bookmarks_db.git

diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py
index 2024ab8..a03d5c1 100644
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -5,14 +5,14 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_base', 'get_error']
 
 
 from base64 import b64encode
-from urllib.parse import urlsplit, urljoin
+from urllib.parse import urljoin
 import sys
 import socket
 import time
@@ -24,6 +24,22 @@ from bkmk_objects import Robot
 from parse_html import parse_html
 
 
+# Fake headers to pretend this is a real browser
+_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
+               " Gecko/20001221 Firefox/2.0.0")
+_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
+
+request_headers = {
+    'Accept': '*/*',
+    'Accept-Language': 'ru,en',
+    'Cache-Control': 'max-age=300',
+    'Connection': 'close',
+    'Referer': '/',
+    'User-Agent': _user_agent,
+    'X-User-Agent': _x_user_agent,
+}
+
+
 reloc_dict = {
   301: "perm1.",
   302: "temp2.",
@@ -62,13 +78,8 @@ class robot_base(Robot):
             self.start = int(time.time())
             bookmark.icon = None
 
-            split_results = urlsplit(bookmark.href)
-            url_type, netloc, url_path, query, url_tag = split_results
-            url_host = split_results.hostname
-
-            url = "%s://%s%s" % (url_type, url_host, url_path)
             error, redirect_code, redirect_to, headers, content = \
-                self.get(bookmark, url, True)
+                self.get(bookmark, bookmark.href, True)
 
             if error:
                 bookmark.error = error
@@ -135,8 +146,13 @@ class robot_base(Robot):
                             break
                     content_stripped = content.strip()
                     if content_stripped and charset:
-                        content_stripped = content_stripped.decode(
-                            charset, 'replace')
+                        try:
+                            content_stripped = content_stripped.decode(
+                                charset, 'replace')
+                        except LookupError:
+                            charset = None
+                            self.log("   unknown charset "
+                                     "in Content-Type header")
                     if content_stripped and is_html:
                         parser = parse_html(
                             content_stripped, charset, self.log)
@@ -151,16 +167,22 @@ class robot_base(Robot):
                             icon = None
                         if not icon:
                             icon = "/favicon.ico"
-                        icon_url = urljoin(
-                            "%s://%s%s" % (url_type, url_host, url_path), icon)
+                        icon_url = urljoin(bookmark.href, icon)
                         self.log("   looking for icon at: %s" % icon_url)
                         if icon_url in icons:
                             if icons[icon_url]:
                                 bookmark.icon_href = icon_url
                                 content_type, bookmark.icon = icons[icon_url]
-                                self.log("   cached icon: %s" % content_type)
+                                self.log("   cached icon    : %s"
+                                         % content_type)
                             else:
-                                self.log("   cached icon: no icon")
+                                self.log("   cached icon    : no icon")
+                        elif icon_url.startswith('data:'):
+                            content_type, icon_data = \
+                                icon_url[len('data:'):].split(',', 1)
+                            bookmark.icon_href = bookmark.icon = icon_url
+                            self.log("   got data icon  : %s" % content_type)
+                            icons[icon_url] = (content_type, icon_url)
                         else:
                             try:
                                 _icon_url = icon_url
@@ -211,7 +233,7 @@ class robot_base(Robot):
                                                        bookmark.icon
                                                        )
                                 else:
-                                    self.log("   no icon        :"
+                                    self.log("   no icon        : "
                                              "bad content type '%s'"
                                              % content_type
                                              )
@@ -249,7 +271,8 @@ class robot_base(Robot):
                     self.log("   no header: %s" % key)
 
             md5 = md5wrapper()
-            if url_type == "ftp":  # Pass welcome message through MD5
+            if bookmark.href.startswith("ftp://"):
+                # Pass welcome message through MD5
                 ftp_welcome = self.get_ftp_welcome()
                 if not isinstance(ftp_welcome, bytes):
                     ftp_welcome = ftp_welcome.encode(charset or 'utf-8')