Fix(Robot): Stop splitting and un-splitting URLs

[bookmarks_db.git] / Robots / bkmk_robot_base.py
diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py

index b5cac691fe9f2842d12184f30de6c6b2271dbb83..a03d5c1b16ec430ab17ac8f899655ae9cf9808bb 100644 (file)
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -5,21 +5,17 @@ This file is a part of Bookmarks database and Internet robot.
  """
  
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
  __license__ = "GNU GPL"
  
  __all__ = ['robot_base', 'get_error']
  
  
  from base64 import b64encode
+from urllib.parse import urljoin
  import sys
  import socket
  import time
-try:
-    from urllib.parse import splittype, splithost, splittag, urljoin
-except ImportError:
-    from urllib import splittype, splithost, splittag
-    from urlparse import urljoin
  
  from m_lib.md5wrapper import md5wrapper
  from m_lib.net.www.util import parse_time
@@ -28,11 +24,28 @@ from bkmk_objects import Robot
  from parse_html import parse_html
  
  
+# Fake headers to pretend this is a real browser
+_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
+               " Gecko/20001221 Firefox/2.0.0")
+_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
+
+request_headers = {
+    'Accept': '*/*',
+    'Accept-Language': 'ru,en',
+    'Cache-Control': 'max-age=300',
+    'Connection': 'close',
+    'Referer': '/',
+    'User-Agent': _user_agent,
+    'X-User-Agent': _x_user_agent,
+}
+
+
  reloc_dict = {
-  301: "perm.",
+  301: "perm1.",
    302: "temp2.",
    303: "temp3.",
    307: "temp7.",
+  308: "perm8.",
    "html": "html"
  }
  
@@ -65,14 +78,8 @@ class robot_base(Robot):
              self.start = int(time.time())
              bookmark.icon = None
  
-            url_type, url_rest = splittype(bookmark.href)
-            url_host, url_path = splithost(url_rest)
-            url_path, url_tag  = splittag(url_path)  # noqa: E221
-            #                    multiple spaces before operator
-
-            url = "%s://%s%s" % (url_type, url_host, url_path)
              error, redirect_code, redirect_to, headers, content = \
-                self.get(bookmark, url, True)
+                self.get(bookmark, bookmark.href, True)
  
              if error:
                  bookmark.error = error
@@ -89,7 +96,7 @@ class robot_base(Robot):
                  try:
                      size = headers["Content-Length"]
                  except KeyError:
-                    size = len(content)
+                    pass
  
                  try:
                      last_modified = headers["Last-Modified"]
@@ -98,7 +105,8 @@ class robot_base(Robot):
  
                  if last_modified:
                      last_modified = parse_time(last_modified)
-            else:
+
+            if not size:  # Could be None from headers
                  size = len(content)
  
              if last_modified:
@@ -109,25 +117,13 @@ class robot_base(Robot):
              bookmark.size = size
              bookmark.last_modified = last_modified
  
-            md5 = md5wrapper()
-            if url_type == "ftp":  # Pass welcome message through MD5
-                ftp_welcome = self.get_ftp_welcome()
-                if not isinstance(ftp_welcome, bytes):
-                    ftp_welcome = ftp_welcome.encode('utf-8')
-                md5.update(ftp_welcome)
-
-            if isinstance(content, bytes):
-                md5.update(content)
-            else:
-                md5.update(content.encode('utf-8'))
-            bookmark.md5 = str(md5)
-
+            charset = None
              if headers:
                  try:
                      content_type = headers["Content-Type"]
-                    self.log("   Content-Type: %s" % content_type)
+                    self.log("   Content-Type   : %s" % content_type)
                      if content_type is None:
-                        if 'html' in content.lower():
+                        if b'html' in content.lower():
                              content_type = 'text/html'
                          else:
                              content_type = 'text/plain'
@@ -135,7 +131,7 @@ class robot_base(Robot):
                                   % content_type)
                      try:
                          # extract charset from
-                        # "text/html; foo; charset=UTF-8, bar; baz;"
+                        # "text/html; charset=UTF-8, foo; bar"
                          content_type, charset = content_type.split(';', 1)
                          content_type = content_type.strip()
                          charset = charset.split('=')[1].strip().split(',')[0]
@@ -143,14 +139,27 @@ class robot_base(Robot):
                      except (ValueError, IndexError):
                          charset = None
                          self.log("   no charset in Content-Type header")
+                    is_html = False
                      for ctype in ("text/html", "application/xhtml+xml"):
                          if content_type.startswith(ctype):
-                            html = True
+                            is_html = True
                              break
-                    else:
-                        html = False
-                    if html:
-                        parser = parse_html(content, charset, self.log)
+                    content_stripped = content.strip()
+                    if content_stripped and charset:
+                        try:
+                            content_stripped = content_stripped.decode(
+                                charset, 'replace')
+                        except LookupError:
+                            charset = None
+                            self.log("   unknown charset "
+                                     "in Content-Type header")
+                    if content_stripped and is_html:
+                        parser = parse_html(
+                            content_stripped, charset, self.log)
+                        if charset:
+                            bookmark.charset = charset
+                        elif parser and parser.meta_charset:
+                            bookmark.charset = parser.meta_charset
                          if parser:
                              bookmark.real_title = parser.title
                              icon = parser.icon
@@ -158,16 +167,22 @@ class robot_base(Robot):
                              icon = None
                          if not icon:
                              icon = "/favicon.ico"
-                        icon_url = urljoin(
-                            "%s://%s%s" % (url_type, url_host, url_path), icon)
+                        icon_url = urljoin(bookmark.href, icon)
                          self.log("   looking for icon at: %s" % icon_url)
                          if icon_url in icons:
                              if icons[icon_url]:
                                  bookmark.icon_href = icon_url
                                  content_type, bookmark.icon = icons[icon_url]
-                                self.log("   cached icon: %s" % content_type)
+                                self.log("   cached icon    : %s"
+                                         % content_type)
                              else:
-                                self.log("   cached icon: no icon")
+                                self.log("   cached icon    : no icon")
+                        elif icon_url.startswith('data:'):
+                            content_type, icon_data = \
+                                icon_url[len('data:'):].split(',', 1)
+                            bookmark.icon_href = bookmark.icon = icon_url
+                            self.log("   got data icon  : %s" % content_type)
+                            icons[icon_url] = (content_type, icon_url)
                          else:
                              try:
                                  _icon_url = icon_url
@@ -211,14 +226,14 @@ class robot_base(Robot):
                                                   " assume x-icon")
                                          content_type = 'image/x-icon'
                                      if not isinstance(icon_data, bytes):
-                                        icon_data = icon_data.encode('utf-8')
+                                        icon_data = icon_data.encode('latin1')
                                      bookmark.icon = "data:%s;base64,%s" \
                                          % (content_type, b64encode(icon_data))
                                      icons[icon_url] = (content_type,
                                                         bookmark.icon
                                                         )
                                  else:
-                                    self.log("   no icon        :"
+                                    self.log("   no icon        : "
                                               "bad content type '%s'"
                                               % content_type
                                               )
@@ -245,10 +260,30 @@ class robot_base(Robot):
                                                    "%s (%s sec)"
                                                    % (url, timeout)
                                                    )
+                    elif charset:
+                        bookmark.charset = charset
  
+                    if not content_stripped:
+                        self.log("   empty response, no content")
+                    if not is_html:
+                        self.log("   not html")
                  except KeyError as key:
                      self.log("   no header: %s" % key)
  
+            md5 = md5wrapper()
+            if bookmark.href.startswith("ftp://"):
+                # Pass welcome message through MD5
+                ftp_welcome = self.get_ftp_welcome()
+                if not isinstance(ftp_welcome, bytes):
+                    ftp_welcome = ftp_welcome.encode(charset or 'utf-8')
+                md5.update(ftp_welcome)
+
+            if isinstance(content, bytes):
+                md5.update(content)
+            else:
+                md5.update(content.encode(charset or 'utf-8'))
+            bookmark.md5 = str(md5)
+
          except EOFError:
              bookmark.error = "Unexpected EOF (FTP server closed connection)"
              self.log('   EOF: %s' % bookmark.error)
@@ -274,8 +309,15 @@ class robot_base(Robot):
          return 1
  
      def set_redirect(self, bookmark, errcode, newurl):
-        bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
-        self.log('   Moved: %s' % bookmark.moved)
+        bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
+        try:
+            moved.encode('ascii')
+        except UnicodeEncodeError:
+            try:
+                moved = moved.encode(bookmark.charset)
+            except (LookupError, TypeError, UnicodeEncodeError):
+                moved = moved.encode('utf-8')
+        self.log('   Moved: %s' % moved)
  
      def finish_check_url(self, bookmark):
          start = self.start