X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=d8877c6f41c6b6f30c8d8ba4b8dbcec396e6c02b;hb=8c04e58972d1c58ab82250df093c3d503eed4fe2;hp=fb8bb2d6eab923b78f7e0c83c9236a78e1e2750d;hpb=0b78160bb3a7a59d6c91ee41ff273321a4b3d556;p=bookmarks_db.git diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index fb8bb2d..d8877c6 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -29,10 +29,11 @@ from parse_html import parse_html reloc_dict = { - 301: "perm.", + 301: "perm1.", 302: "temp2.", 303: "temp3.", 307: "temp7.", + 308: "temp8.", "html": "html" } @@ -111,15 +112,28 @@ class robot_base(Robot): md5 = md5wrapper() if url_type == "ftp": # Pass welcome message through MD5 - md5.update(self.get_ftp_welcome()) + ftp_welcome = self.get_ftp_welcome() + if not isinstance(ftp_welcome, bytes): + ftp_welcome = ftp_welcome.encode('utf-8') + md5.update(ftp_welcome) - md5.update(content) + if isinstance(content, bytes): + md5.update(content) + else: + md5.update(content.encode('utf-8')) bookmark.md5 = str(md5) if headers: try: content_type = headers["Content-Type"] - self.log(" Content-Type: %s" % content_type) + self.log(" Content-Type : %s" % content_type) + if content_type is None: + if 'html' in content.lower(): + content_type = 'text/html' + else: + content_type = 'text/plain' + self.log(" Set Content-Type to: %s" + % content_type) try: # extract charset from # "text/html; foo; charset=UTF-8, bar; baz;" @@ -130,13 +144,12 @@ class robot_base(Robot): except (ValueError, IndexError): charset = None self.log(" no charset in Content-Type header") + is_html = False for ctype in ("text/html", "application/xhtml+xml"): if content_type.startswith(ctype): - html = True + is_html = True break - else: - html = False - if html: + if content and is_html: parser = parse_html(content, charset, self.log) if parser: bookmark.real_title = parser.title @@ -181,9 +194,11 @@ class robot_base(Robot): icons[icon_url] = None else: content_type = icon_headers["Content-Type"] - if content_type.startswith("application/") \ - or content_type.startswith("image/") \ - or content_type.startswith("text/plain"): + if content_type and ( + content_type.startswith("application/") + or content_type.startswith("image/") + or content_type.startswith("text/plain") + ): bookmark.icon_href = icon_url self.log(" got icon : %s" % content_type) @@ -195,6 +210,8 @@ class robot_base(Robot): self.log(" non-image content type," " assume x-icon") content_type = 'image/x-icon' + if not isinstance(icon_data, bytes): + icon_data = icon_data.encode('utf-8') bookmark.icon = "data:%s;base64,%s" \ % (content_type, b64encode(icon_data)) icons[icon_url] = (content_type, @@ -229,6 +246,10 @@ class robot_base(Robot): % (url, timeout) ) + if not content: + self.log(" empty response, no content") + if not is_html: + self.log(" not html") except KeyError as key: self.log(" no header: %s" % key) @@ -257,8 +278,15 @@ class robot_base(Robot): return 1 def set_redirect(self, bookmark, errcode, newurl): - bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl) - self.log(' Moved: %s' % bookmark.moved) + bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl) + try: + moved.encode('ascii') + except UnicodeEncodeError: + try: + moved = moved.encode(bookmark.charset) + except (LookupError, TypeError, UnicodeEncodeError): + moved = moved.encode('utf-8') + self.log(' Moved: %s' % moved) def finish_check_url(self, bookmark): start = self.start