X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=c5afd3fdc0aef90c1db8f69551e64c37e7c70882;hb=188d6e0bfa16f616aaae084d80411b6f3222270f;hp=f32e1b304d45aab959b047d51db4140241b90633;hpb=e69ee847224026d24684847d19d178f5b0db9309;p=bookmarks_db.git diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index f32e1b3..c5afd3f 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -90,7 +90,7 @@ class robot_base(Robot): try: size = headers["Content-Length"] except KeyError: - size = len(content) + pass try: last_modified = headers["Last-Modified"] @@ -99,7 +99,8 @@ class robot_base(Robot): if last_modified: last_modified = parse_time(last_modified) - else: + + if not size: # Could be None from headers size = len(content) if last_modified: @@ -110,25 +111,13 @@ class robot_base(Robot): bookmark.size = size bookmark.last_modified = last_modified - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - ftp_welcome = self.get_ftp_welcome() - if not isinstance(ftp_welcome, bytes): - ftp_welcome = ftp_welcome.encode('utf-8') - md5.update(ftp_welcome) - - if isinstance(content, bytes): - md5.update(content) - else: - md5.update(content.encode('utf-8')) - bookmark.md5 = str(md5) - + charset = None if headers: try: content_type = headers["Content-Type"] - self.log(" Content-Type: %s" % content_type) + self.log(" Content-Type : %s" % content_type) if content_type is None: - if 'html' in content.lower(): + if b'html' in content.lower(): content_type = 'text/html' else: content_type = 'text/plain' @@ -136,7 +125,7 @@ class robot_base(Robot): % content_type) try: # extract charset from - # "text/html; foo; charset=UTF-8, bar; baz;" + # "text/html; charset=UTF-8, foo; bar" content_type, charset = content_type.split(';', 1) content_type = content_type.strip() charset = charset.split('=')[1].strip().split(',')[0] @@ -149,8 +138,17 @@ class robot_base(Robot): if content_type.startswith(ctype): is_html = True break - if content and is_html: - parser = parse_html(content, charset, self.log) + content_stripped = content.strip() + if content_stripped and charset: + content_stripped = content_stripped.decode( + charset, 'replace') + if content_stripped and is_html: + parser = parse_html( + content_stripped, charset, self.log) + if charset: + bookmark.charset = charset + elif parser and parser.meta_charset: + bookmark.charset = parser.meta_charset if parser: bookmark.real_title = parser.title icon = parser.icon @@ -211,7 +209,7 @@ class robot_base(Robot): " assume x-icon") content_type = 'image/x-icon' if not isinstance(icon_data, bytes): - icon_data = icon_data.encode('utf-8') + icon_data = icon_data.encode('latin1') bookmark.icon = "data:%s;base64,%s" \ % (content_type, b64encode(icon_data)) icons[icon_url] = (content_type, @@ -245,14 +243,29 @@ class robot_base(Robot): "%s (%s sec)" % (url, timeout) ) + elif charset: + bookmark.charset = charset - if not content: + if not content_stripped: self.log(" empty response, no content") if not is_html: self.log(" not html") except KeyError as key: self.log(" no header: %s" % key) + md5 = md5wrapper() + if url_type == "ftp": # Pass welcome message through MD5 + ftp_welcome = self.get_ftp_welcome() + if not isinstance(ftp_welcome, bytes): + ftp_welcome = ftp_welcome.encode(charset or 'utf-8') + md5.update(ftp_welcome) + + if isinstance(content, bytes): + md5.update(content) + else: + md5.update(content.encode(charset or 'utf-8')) + bookmark.md5 = str(md5) + except EOFError: bookmark.error = "Unexpected EOF (FTP server closed connection)" self.log(' EOF: %s' % bookmark.error)