X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=a03d5c1b16ec430ab17ac8f899655ae9cf9808bb;hb=HEAD;hp=2024ab85efd74594eb4700078f19f7e012ae2b78;hpb=39125836fc96da8bd411da9b03bdc14b48eb4e9d;p=bookmarks_db.git diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index 2024ab8..a03d5c1 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -5,14 +5,14 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['robot_base', 'get_error'] from base64 import b64encode -from urllib.parse import urlsplit, urljoin +from urllib.parse import urljoin import sys import socket import time @@ -24,6 +24,22 @@ from bkmk_objects import Robot from parse_html import parse_html +# Fake headers to pretend this is a real browser +_user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)" +" Gecko/20001221 Firefox/2.0.0" +_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3] + +request_headers = { + 'Accept': '*/*', + 'Accept-Language': 'ru,en', + 'Cache-Control': 'max-age=300', + 'Connection': 'close', + 'Referer': '/', + 'User-Agent': _user_agent, + 'X-User-Agent': _x_user_agent, +} + + reloc_dict = { 301: "perm1.", 302: "temp2.", @@ -62,13 +78,8 @@ class robot_base(Robot): self.start = int(time.time()) bookmark.icon = None - split_results = urlsplit(bookmark.href) - url_type, netloc, url_path, query, url_tag = split_results - url_host = split_results.hostname - - url = "%s://%s%s" % (url_type, url_host, url_path) error, redirect_code, redirect_to, headers, content = \ - self.get(bookmark, url, True) + self.get(bookmark, bookmark.href, True) if error: bookmark.error = error @@ -135,8 +146,13 @@ class robot_base(Robot): break content_stripped = content.strip() if content_stripped and charset: - content_stripped = content_stripped.decode( - charset, 'replace') + try: + content_stripped = content_stripped.decode( + charset, 'replace') + except LookupError: + charset = None + self.log(" unknown charset " + "in Content-Type header") if content_stripped and is_html: parser = parse_html( content_stripped, charset, self.log) @@ -151,16 +167,22 @@ class robot_base(Robot): icon = None if not icon: icon = "/favicon.ico" - icon_url = urljoin( - "%s://%s%s" % (url_type, url_host, url_path), icon) + icon_url = urljoin(bookmark.href, icon) self.log(" looking for icon at: %s" % icon_url) if icon_url in icons: if icons[icon_url]: bookmark.icon_href = icon_url content_type, bookmark.icon = icons[icon_url] - self.log(" cached icon: %s" % content_type) + self.log(" cached icon : %s" + % content_type) else: - self.log(" cached icon: no icon") + self.log(" cached icon : no icon") + elif icon_url.startswith('data:'): + content_type, icon_data = \ + icon_url[len('data:'):].split(',', 1) + bookmark.icon_href = bookmark.icon = icon_url + self.log(" got data icon : %s" % content_type) + icons[icon_url] = (content_type, icon_url) else: try: _icon_url = icon_url @@ -211,7 +233,7 @@ class robot_base(Robot): bookmark.icon ) else: - self.log(" no icon :" + self.log(" no icon : " "bad content type '%s'" % content_type ) @@ -249,7 +271,8 @@ class robot_base(Robot): self.log(" no header: %s" % key) md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 + if bookmark.href.startswith("ftp://"): + # Pass welcome message through MD5 ftp_welcome = self.get_ftp_welcome() if not isinstance(ftp_welcome, bytes): ftp_welcome = ftp_welcome.encode(charset or 'utf-8')