X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=c5afd3fdc0aef90c1db8f69551e64c37e7c70882;hb=e5a4bfbed97d95fd2a658d6dde003b5af154d95d;hp=d2175ace96bc3c9d084cc3b3e0394ffe0929791e;hpb=05f0cc396bc24da45fa1a8b0a79c97c79f399465;p=bookmarks_db.git diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index d2175ac..c5afd3f 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -16,10 +16,10 @@ import sys import socket import time try: - from urllib.parse import splittype, splithost, splittag + from urllib.parse import splittype, splithost, splittag, urljoin except ImportError: from urllib import splittype, splithost, splittag -from urlparse import urljoin + from urlparse import urljoin from m_lib.md5wrapper import md5wrapper from m_lib.net.www.util import parse_time @@ -29,10 +29,11 @@ from parse_html import parse_html reloc_dict = { - 301: "perm.", + 301: "perm1.", 302: "temp2.", 303: "temp3.", 307: "temp7.", + 308: "temp8.", "html": "html" } @@ -89,7 +90,7 @@ class robot_base(Robot): try: size = headers["Content-Length"] except KeyError: - size = len(content) + pass try: last_modified = headers["Last-Modified"] @@ -98,7 +99,8 @@ class robot_base(Robot): if last_modified: last_modified = parse_time(last_modified) - else: + + if not size: # Could be None from headers size = len(content) if last_modified: @@ -109,20 +111,21 @@ class robot_base(Robot): bookmark.size = size bookmark.last_modified = last_modified - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - md5.update(self.get_ftp_welcome()) - - md5.update(content) - bookmark.md5 = str(md5) - + charset = None if headers: try: content_type = headers["Content-Type"] - self.log(" Content-Type: %s" % content_type) + self.log(" Content-Type : %s" % content_type) + if content_type is None: + if b'html' in content.lower(): + content_type = 'text/html' + else: + content_type = 'text/plain' + self.log(" Set Content-Type to: %s" + % content_type) try: # extract charset from - # "text/html; foo; charset=UTF-8, bar; baz;" + # "text/html; charset=UTF-8, foo; bar" content_type, charset = content_type.split(';', 1) content_type = content_type.strip() charset = charset.split('=')[1].strip().split(',')[0] @@ -130,14 +133,22 @@ class robot_base(Robot): except (ValueError, IndexError): charset = None self.log(" no charset in Content-Type header") + is_html = False for ctype in ("text/html", "application/xhtml+xml"): if content_type.startswith(ctype): - html = True + is_html = True break - else: - html = False - if html: - parser = parse_html(content, charset, self.log) + content_stripped = content.strip() + if content_stripped and charset: + content_stripped = content_stripped.decode( + charset, 'replace') + if content_stripped and is_html: + parser = parse_html( + content_stripped, charset, self.log) + if charset: + bookmark.charset = charset + elif parser and parser.meta_charset: + bookmark.charset = parser.meta_charset if parser: bookmark.real_title = parser.title icon = parser.icon @@ -181,9 +192,11 @@ class robot_base(Robot): icons[icon_url] = None else: content_type = icon_headers["Content-Type"] - if content_type.startswith("application/") \ - or content_type.startswith("image/") \ - or content_type.startswith("text/plain"): + if content_type and ( + content_type.startswith("application/") + or content_type.startswith("image/") + or content_type.startswith("text/plain") + ): bookmark.icon_href = icon_url self.log(" got icon : %s" % content_type) @@ -195,6 +208,8 @@ class robot_base(Robot): self.log(" non-image content type," " assume x-icon") content_type = 'image/x-icon' + if not isinstance(icon_data, bytes): + icon_data = icon_data.encode('latin1') bookmark.icon = "data:%s;base64,%s" \ % (content_type, b64encode(icon_data)) icons[icon_url] = (content_type, @@ -228,10 +243,29 @@ class robot_base(Robot): "%s (%s sec)" % (url, timeout) ) + elif charset: + bookmark.charset = charset + if not content_stripped: + self.log(" empty response, no content") + if not is_html: + self.log(" not html") except KeyError as key: self.log(" no header: %s" % key) + md5 = md5wrapper() + if url_type == "ftp": # Pass welcome message through MD5 + ftp_welcome = self.get_ftp_welcome() + if not isinstance(ftp_welcome, bytes): + ftp_welcome = ftp_welcome.encode(charset or 'utf-8') + md5.update(ftp_welcome) + + if isinstance(content, bytes): + md5.update(content) + else: + md5.update(content.encode(charset or 'utf-8')) + bookmark.md5 = str(md5) + except EOFError: bookmark.error = "Unexpected EOF (FTP server closed connection)" self.log(' EOF: %s' % bookmark.error) @@ -257,8 +291,15 @@ class robot_base(Robot): return 1 def set_redirect(self, bookmark, errcode, newurl): - bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl) - self.log(' Moved: %s' % bookmark.moved) + bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl) + try: + moved.encode('ascii') + except UnicodeEncodeError: + try: + moved = moved.encode(bookmark.charset) + except (LookupError, TypeError, UnicodeEncodeError): + moved = moved.encode('utf-8') + self.log(' Moved: %s' % moved) def finish_check_url(self, bookmark): start = self.start