git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_robot_base.py
Fix(Robots/bkmk_robot_base): Add forgotten space in log
[bookmarks_db.git] / Robots / bkmk_robot_base.py
index d8877c6f41c6b6f30c8d8ba4b8dbcec396e6c02b..724391853f57f4d7912f579e261d403de35ccb47 100644 (file)
@@ -5,21 +5,17 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_base', 'get_error']
 
 
 from base64 import b64encode
+from urllib.parse import urlsplit, urljoin
 import sys
 import socket
 import time
-try:
-    from urllib.parse import splittype, splithost, splittag, urljoin
-except ImportError:
-    from urllib import splittype, splithost, splittag
-    from urlparse import urljoin
 
 from m_lib.md5wrapper import md5wrapper
 from m_lib.net.www.util import parse_time
@@ -66,10 +62,9 @@ class robot_base(Robot):
             self.start = int(time.time())
             bookmark.icon = None
 
-            url_type, url_rest = splittype(bookmark.href)
-            url_host, url_path = splithost(url_rest)
-            url_path, url_tag  = splittag(url_path)  # noqa: E221
-            #                    multiple spaces before operator
+            split_results = urlsplit(bookmark.href)
+            url_type, netloc, url_path, query, url_tag = split_results
+            url_host = split_results.hostname
 
             url = "%s://%s%s" % (url_type, url_host, url_path)
             error, redirect_code, redirect_to, headers, content = \
@@ -90,7 +85,7 @@ class robot_base(Robot):
                 try:
                     size = headers["Content-Length"]
                 except KeyError:
-                    size = len(content)
+                    pass
 
                 try:
                     last_modified = headers["Last-Modified"]
@@ -99,7 +94,8 @@ class robot_base(Robot):
 
                 if last_modified:
                     last_modified = parse_time(last_modified)
-            else:
+
+            if not size:  # Could be None from headers
                 size = len(content)
 
             if last_modified:
@@ -110,25 +106,13 @@ class robot_base(Robot):
             bookmark.size = size
             bookmark.last_modified = last_modified
 
-            md5 = md5wrapper()
-            if url_type == "ftp":  # Pass welcome message through MD5
-                ftp_welcome = self.get_ftp_welcome()
-                if not isinstance(ftp_welcome, bytes):
-                    ftp_welcome = ftp_welcome.encode('utf-8')
-                md5.update(ftp_welcome)
-
-            if isinstance(content, bytes):
-                md5.update(content)
-            else:
-                md5.update(content.encode('utf-8'))
-            bookmark.md5 = str(md5)
-
+            charset = None
             if headers:
                 try:
                     content_type = headers["Content-Type"]
                     self.log("   Content-Type   : %s" % content_type)
                     if content_type is None:
-                        if 'html' in content.lower():
+                        if b'html' in content.lower():
                             content_type = 'text/html'
                         else:
                             content_type = 'text/plain'
@@ -136,7 +120,7 @@ class robot_base(Robot):
                                  % content_type)
                     try:
                         # extract charset from
-                        # "text/html; foo; charset=UTF-8, bar; baz;"
+                        # "text/html; charset=UTF-8, foo; bar"
                         content_type, charset = content_type.split(';', 1)
                         content_type = content_type.strip()
                         charset = charset.split('=')[1].strip().split(',')[0]
@@ -149,8 +133,17 @@ class robot_base(Robot):
                         if content_type.startswith(ctype):
                             is_html = True
                             break
-                    if content and is_html:
-                        parser = parse_html(content, charset, self.log)
+                    content_stripped = content.strip()
+                    if content_stripped and charset:
+                        content_stripped = content_stripped.decode(
+                            charset, 'replace')
+                    if content_stripped and is_html:
+                        parser = parse_html(
+                            content_stripped, charset, self.log)
+                        if charset:
+                            bookmark.charset = charset
+                        elif parser and parser.meta_charset:
+                            bookmark.charset = parser.meta_charset
                         if parser:
                             bookmark.real_title = parser.title
                             icon = parser.icon
@@ -211,14 +204,14 @@ class robot_base(Robot):
                                                  " assume x-icon")
                                         content_type = 'image/x-icon'
                                     if not isinstance(icon_data, bytes):
-                                        icon_data = icon_data.encode('utf-8')
+                                        icon_data = icon_data.encode('latin1')
                                     bookmark.icon = "data:%s;base64,%s" \
                                         % (content_type, b64encode(icon_data))
                                     icons[icon_url] = (content_type,
                                                        bookmark.icon
                                                        )
                                 else:
-                                    self.log("   no icon        :"
+                                    self.log("   no icon        : "
                                              "bad content type '%s'"
                                              % content_type
                                              )
@@ -245,14 +238,29 @@ class robot_base(Robot):
                                                   "%s (%s sec)"
                                                   % (url, timeout)
                                                   )
+                    elif charset:
+                        bookmark.charset = charset
 
-                    if not content:
+                    if not content_stripped:
                         self.log("   empty response, no content")
                     if not is_html:
                         self.log("   not html")
                 except KeyError as key:
                     self.log("   no header: %s" % key)
 
+            md5 = md5wrapper()
+            if url_type == "ftp":  # Pass welcome message through MD5
+                ftp_welcome = self.get_ftp_welcome()
+                if not isinstance(ftp_welcome, bytes):
+                    ftp_welcome = ftp_welcome.encode(charset or 'utf-8')
+                md5.update(ftp_welcome)
+
+            if isinstance(content, bytes):
+                md5.update(content)
+            else:
+                md5.update(content.encode(charset or 'utf-8'))
+            bookmark.md5 = str(md5)
+
         except EOFError:
             bookmark.error = "Unexpected EOF (FTP server closed connection)"
             self.log('   EOF: %s' % bookmark.error)