X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=0600e7338d2b71c41492d73858acf72d0818ae84;hb=HEAD;hp=3b5ec274760a59b3578e25a5bec87a7b7d09c7ac;hpb=2e673dc1e4202710ca17ee2f6eab2dae721139ac;p=bookmarks_db.git diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index 3b5ec27..a03d5c1 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -5,18 +5,17 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['robot_base', 'get_error'] from base64 import b64encode +from urllib.parse import urljoin import sys import socket import time -import urllib -from urlparse import urljoin from m_lib.md5wrapper import md5wrapper from m_lib.net.www.util import parse_time @@ -25,11 +24,28 @@ from bkmk_objects import Robot from parse_html import parse_html +# Fake headers to pretend this is a real browser +_user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)" +" Gecko/20001221 Firefox/2.0.0" +_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3] + +request_headers = { + 'Accept': '*/*', + 'Accept-Language': 'ru,en', + 'Cache-Control': 'max-age=300', + 'Connection': 'close', + 'Referer': '/', + 'User-Agent': _user_agent, + 'X-User-Agent': _x_user_agent, +} + + reloc_dict = { - 301: "perm.", + 301: "perm1.", 302: "temp2.", 303: "temp3.", 307: "temp7.", + 308: "temp8.", "html": "html" } @@ -62,14 +78,8 @@ class robot_base(Robot): self.start = int(time.time()) bookmark.icon = None - url_type, url_rest = urllib.splittype(bookmark.href) - url_host, url_path = urllib.splithost(url_rest) - url_path, url_tag = urllib.splittag(url_path) # noqa: E221 - # multiple spaces before operator - - url = "%s://%s%s" % (url_type, url_host, url_path) error, redirect_code, redirect_to, headers, content = \ - self.get(bookmark, url, True) + self.get(bookmark, bookmark.href, True) if error: bookmark.error = error @@ -86,7 +96,7 @@ class robot_base(Robot): try: size = headers["Content-Length"] except KeyError: - size = len(content) + pass try: last_modified = headers["Last-Modified"] @@ -95,7 +105,8 @@ class robot_base(Robot): if last_modified: last_modified = parse_time(last_modified) - else: + + if not size: # Could be None from headers size = len(content) if last_modified: @@ -106,20 +117,21 @@ class robot_base(Robot): bookmark.size = size bookmark.last_modified = last_modified - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - md5.update(self.get_ftp_welcome()) - - md5.update(content) - bookmark.md5 = str(md5) - + charset = None if headers: try: content_type = headers["Content-Type"] - self.log(" Content-Type: %s" % content_type) + self.log(" Content-Type : %s" % content_type) + if content_type is None: + if b'html' in content.lower(): + content_type = 'text/html' + else: + content_type = 'text/plain' + self.log(" Set Content-Type to: %s" + % content_type) try: # extract charset from - # "text/html; foo; charset=UTF-8, bar; baz;" + # "text/html; charset=UTF-8, foo; bar" content_type, charset = content_type.split(';', 1) content_type = content_type.strip() charset = charset.split('=')[1].strip().split(',')[0] @@ -127,14 +139,27 @@ class robot_base(Robot): except (ValueError, IndexError): charset = None self.log(" no charset in Content-Type header") + is_html = False for ctype in ("text/html", "application/xhtml+xml"): if content_type.startswith(ctype): - html = True + is_html = True break - else: - html = False - if html: - parser = parse_html(content, charset, self.log) + content_stripped = content.strip() + if content_stripped and charset: + try: + content_stripped = content_stripped.decode( + charset, 'replace') + except LookupError: + charset = None + self.log(" unknown charset " + "in Content-Type header") + if content_stripped and is_html: + parser = parse_html( + content_stripped, charset, self.log) + if charset: + bookmark.charset = charset + elif parser and parser.meta_charset: + bookmark.charset = parser.meta_charset if parser: bookmark.real_title = parser.title icon = parser.icon @@ -142,16 +167,22 @@ class robot_base(Robot): icon = None if not icon: icon = "/favicon.ico" - icon_url = urljoin( - "%s://%s%s" % (url_type, url_host, url_path), icon) + icon_url = urljoin(bookmark.href, icon) self.log(" looking for icon at: %s" % icon_url) if icon_url in icons: if icons[icon_url]: bookmark.icon_href = icon_url content_type, bookmark.icon = icons[icon_url] - self.log(" cached icon: %s" % content_type) + self.log(" cached icon : %s" + % content_type) else: - self.log(" cached icon: no icon") + self.log(" cached icon : no icon") + elif icon_url.startswith('data:'): + content_type, icon_data = \ + icon_url[len('data:'):].split(',', 1) + bookmark.icon_href = bookmark.icon = icon_url + self.log(" got data icon : %s" % content_type) + icons[icon_url] = (content_type, icon_url) else: try: _icon_url = icon_url @@ -178,9 +209,11 @@ class robot_base(Robot): icons[icon_url] = None else: content_type = icon_headers["Content-Type"] - if content_type.startswith("application/") \ - or content_type.startswith("image/") \ - or content_type.startswith("text/plain"): + if content_type and ( + content_type.startswith("application/") + or content_type.startswith("image/") + or content_type.startswith("text/plain") + ): bookmark.icon_href = icon_url self.log(" got icon : %s" % content_type) @@ -192,13 +225,15 @@ class robot_base(Robot): self.log(" non-image content type," " assume x-icon") content_type = 'image/x-icon' + if not isinstance(icon_data, bytes): + icon_data = icon_data.encode('latin1') bookmark.icon = "data:%s;base64,%s" \ % (content_type, b64encode(icon_data)) icons[icon_url] = (content_type, bookmark.icon ) else: - self.log(" no icon :" + self.log(" no icon : " "bad content type '%s'" % content_type ) @@ -225,10 +260,30 @@ class robot_base(Robot): "%s (%s sec)" % (url, timeout) ) + elif charset: + bookmark.charset = charset + if not content_stripped: + self.log(" empty response, no content") + if not is_html: + self.log(" not html") except KeyError as key: self.log(" no header: %s" % key) + md5 = md5wrapper() + if bookmark.href.startswith("ftp://"): + # Pass welcome message through MD5 + ftp_welcome = self.get_ftp_welcome() + if not isinstance(ftp_welcome, bytes): + ftp_welcome = ftp_welcome.encode(charset or 'utf-8') + md5.update(ftp_welcome) + + if isinstance(content, bytes): + md5.update(content) + else: + md5.update(content.encode(charset or 'utf-8')) + bookmark.md5 = str(md5) + except EOFError: bookmark.error = "Unexpected EOF (FTP server closed connection)" self.log(' EOF: %s' % bookmark.error) @@ -254,8 +309,15 @@ class robot_base(Robot): return 1 def set_redirect(self, bookmark, errcode, newurl): - bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl) - self.log(' Moved: %s' % bookmark.moved) + bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl) + try: + moved.encode('ascii') + except UnicodeEncodeError: + try: + moved = moved.encode(bookmark.charset) + except (LookupError, TypeError, UnicodeEncodeError): + moved = moved.encode('utf-8') + self.log(' Moved: %s' % moved) def finish_check_url(self, bookmark): start = self.start