X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=52d6b563f1056c9ffe29a08299d59084d248be4e;hb=066f29ea81222a8a2ddd4ab1aff131d7fc1ec37f;hp=9c9d8c5823543bc442472274068da5acbf8417b0;hpb=c2ea4e82718b903aa123dd77490f36657383b0ca;p=bookmarks_db.git diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index 9c9d8c5..52d6b56 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -15,8 +15,11 @@ from base64 import b64encode import sys import socket import time -import urllib -from urlparse import urljoin +try: + from urllib.parse import splittype, splithost, splittag, urljoin +except ImportError: + from urllib import splittype, splithost, splittag + from urlparse import urljoin from m_lib.md5wrapper import md5wrapper from m_lib.net.www.util import parse_time @@ -26,10 +29,11 @@ from parse_html import parse_html reloc_dict = { - 301: "perm.", + 301: "perm1.", 302: "temp2.", 303: "temp3.", 307: "temp7.", + 308: "temp8.", "html": "html" } @@ -62,13 +66,14 @@ class robot_base(Robot): self.start = int(time.time()) bookmark.icon = None - url_type, url_rest = urllib.splittype(bookmark.href) - url_host, url_path = urllib.splithost(url_rest) - url_path, url_tag = urllib.splittag(url_path) # noqa: E221 - # multiple spaces before operator + url_type, url_rest = splittype(bookmark.href) + url_host, url_path = splithost(url_rest) + url_path, url_tag = splittag(url_path) # noqa: E221 + # multiple spaces before operator url = "%s://%s%s" % (url_type, url_host, url_path) - error, redirect_code, redirect_to, headers, content = self.get(bookmark, url, True) + error, redirect_code, redirect_to, headers, content = \ + self.get(bookmark, url, True) if error: bookmark.error = error @@ -107,17 +112,31 @@ class robot_base(Robot): md5 = md5wrapper() if url_type == "ftp": # Pass welcome message through MD5 - md5.update(self.get_ftp_welcome()) + ftp_welcome = self.get_ftp_welcome() + if not isinstance(ftp_welcome, bytes): + ftp_welcome = ftp_welcome.encode('utf-8') + md5.update(ftp_welcome) - md5.update(content) + if isinstance(content, bytes): + md5.update(content) + else: + md5.update(content.encode('utf-8')) bookmark.md5 = str(md5) if headers: try: content_type = headers["Content-Type"] - self.log(" Content-Type: %s" % content_type) + self.log(" Content-Type : %s" % content_type) + if content_type is None: + if 'html' in content.lower(): + content_type = 'text/html' + else: + content_type = 'text/plain' + self.log(" Set Content-Type to: %s" + % content_type) try: - # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" + # extract charset from + # "text/html; foo; charset=UTF-8, bar; baz;" content_type, charset = content_type.split(';', 1) content_type = content_type.strip() charset = charset.split('=')[1].strip().split(',')[0] @@ -125,14 +144,14 @@ class robot_base(Robot): except (ValueError, IndexError): charset = None self.log(" no charset in Content-Type header") + is_html = False for ctype in ("text/html", "application/xhtml+xml"): if content_type.startswith(ctype): - html = True + is_html = True break - else: - html = False - if html: - parser = parse_html(content, charset, self.log) + content_stripped = content.strip() + if content_stripped and is_html: + parser = parse_html(content_stripped, charset, self.log) if parser: bookmark.real_title = parser.title icon = parser.icon @@ -140,7 +159,8 @@ class robot_base(Robot): icon = None if not icon: icon = "/favicon.ico" - icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon) + icon_url = urljoin( + "%s://%s%s" % (url_type, url_host, url_path), icon) self.log(" looking for icon at: %s" % icon_url) if icon_url in icons: if icons[icon_url]: @@ -153,12 +173,14 @@ class robot_base(Robot): try: _icon_url = icon_url for i in range(8): - error, icon_redirect_code, icon_redirect_to, \ - icon_headers, icon_data = \ + error, icon_redirect_code, \ + icon_redirect_to, icon_headers, \ + icon_data = \ self.get(bookmark, _icon_url) if icon_redirect_code: _icon_url = icon_redirect_to - self.log(" redirect to : %s" % _icon_url) + self.log(" redirect to : %s" + % _icon_url) else: if icon_data is None: raise IOError("No icon") @@ -166,25 +188,41 @@ class robot_base(Robot): else: raise IOError("Too many redirects") except: - etype, emsg, tb = sys.exc_info() - self.log(" no icon : %s %s" % (etype, emsg)) - etype = emsg = tb = None + etype, emsg, _ = sys.exc_info() + self.log(" no icon : %s %s" + % (etype, emsg)) + etype = emsg = _ = None icons[icon_url] = None else: content_type = icon_headers["Content-Type"] - if content_type.startswith("application/") \ - or content_type.startswith("image/") \ - or content_type.startswith("text/plain"): + if content_type and ( + content_type.startswith("application/") + or content_type.startswith("image/") + or content_type.startswith("text/plain") + ): bookmark.icon_href = icon_url - self.log(" got icon : %s" % content_type) - if content_type.startswith("application/") \ - or content_type.startswith("text/plain"): - self.log(" non-image content type, assume x-icon") + self.log(" got icon : %s" + % content_type) + if ( + content_type.startswith("application/") + or content_type.startswith( + "text/plain") + ): + self.log(" non-image content type," + " assume x-icon") content_type = 'image/x-icon' - bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data)) - icons[icon_url] = (content_type, bookmark.icon) + if not isinstance(icon_data, bytes): + icon_data = icon_data.encode('utf-8') + bookmark.icon = "data:%s;base64,%s" \ + % (content_type, b64encode(icon_data)) + icons[icon_url] = (content_type, + bookmark.icon + ) else: - self.log(" no icon : bad content type '%s'" % content_type) + self.log(" no icon :" + "bad content type '%s'" + % content_type + ) icons[icon_url] = None if parser and parser.refresh: refresh = parser.refresh @@ -195,14 +233,24 @@ class robot_base(Robot): try: timeout = float(refresh.split(';')[0]) except (IndexError, ValueError): - self.set_redirect(bookmark, "html", "Bad redirect to %s (%s)" % (url, refresh)) + self.set_redirect(bookmark, "html", + "Bad redirect to %s (%s)" + % (url, refresh) + ) else: try: timeout = int(refresh.split(';')[0]) except ValueError: pass # float timeout - self.set_redirect(bookmark, "html", "%s (%s sec)" % (url, timeout)) - + self.set_redirect(bookmark, "html", + "%s (%s sec)" + % (url, timeout) + ) + + if not content_stripped: + self.log(" empty response, no content") + if not is_html: + self.log(" not html") except KeyError as key: self.log(" no header: %s" % key) @@ -231,8 +279,15 @@ class robot_base(Robot): return 1 def set_redirect(self, bookmark, errcode, newurl): - bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl) - self.log(' Moved: %s' % bookmark.moved) + bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl) + try: + moved.encode('ascii') + except UnicodeEncodeError: + try: + moved = moved.encode(bookmark.charset) + except (LookupError, TypeError, UnicodeEncodeError): + moved = moved.encode('utf-8') + self.log(' Moved: %s' % moved) def finish_check_url(self, bookmark): start = self.start