From 9e2bf3eaee5a8ee84c61ee5eac9c55090d45f63f Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Sat, 11 Nov 2023 21:35:26 +0300
Subject: [PATCH] Fix(Py3): Stop encoding unicode to bytes

---
 Storage/bkmk_stjson.py               | 27 ++++-----
 Writers/bkmk_wflad.py                |  4 +-
 bkmk_objects.py                      |  6 +-
 bkmk_parser.py                       |  6 +-
 parse_html/bkmk_parse_html.py        | 82 ++++++++++++++--------
 parse_html/bkmk_ph_beautifulsoup.py  |  4 +-
 parse_html/bkmk_ph_beautifulsoup4.py | 12 ++--
 parse_html/bkmk_ph_etreetidy.py      |  6 +-
 parse_html/bkmk_ph_html5.py          |  6 +-
 parse_html/bkmk_ph_lxml.py           |  6 +-
 10 files changed, 77 insertions(+), 82 deletions(-)

diff --git a/Storage/bkmk_stjson.py b/Storage/bkmk_stjson.py
index 1d34cbd..9f7fed7 100644
--- a/Storage/bkmk_stjson.py
+++ b/Storage/bkmk_stjson.py
@@ -41,7 +41,7 @@ class storage_json(Walker):
         dict["lastModified"] = convert_date_to_json(f.last_modified)
         root = getattr(f, 'root')
         if root: dict["root"] = root
-        dict["title"] = f.name.decode('utf-8')
+        dict["title"] = f.name
         dict["type"] = "text/x-moz-place-container"
         if root:
             self.dict["children"].append(dict)
@@ -69,7 +69,7 @@ class storage_json(Walker):
         keyword = getattr(b, 'keyword')
         if keyword: dict["keyword"] = keyword
         dict["lastModified"] = convert_date_to_json(b.last_modified)
-        dict["title"] = b.name.decode('utf-8')
+        dict["title"] = b.name
         dict["type"] = "text/x-moz-place"
         dict["uri"] = b.href
         self.folder_stack[-1].append(dict)
@@ -84,7 +84,7 @@ class storage_json(Walker):
         if guid: dict["guid"] = guid
         dict["index"] = r.index
         dict["lastModified"] = convert_date_to_json(r.last_modified)
-        if r.name: dict["title"] = r.name.decode('utf-8')
+        if r.name: dict["title"] = r.name
         dict["type"] = "text/x-moz-place-separator"
         self.folder_stack[-1].append(dict)
 
@@ -135,7 +135,7 @@ class storage_json(Walker):
         folder.guid = fdict.get("guid")
         folder.index = fdict.get("index")
         folder.root = fdict.get("root")
-        folder.name = encode_title(fdict["title"])
+        folder.name = fdict["title"]
 
         if "children" in fdict:
             for record in fdict["children"]:
@@ -154,7 +154,7 @@ class storage_json(Walker):
 
                 elif record["type"] == "text/x-moz-place":
                     bookmark = Bookmark(
-                        href=record["uri"].encode('utf-8'),
+                        href=record["uri"],
                         add_date=convert_date_from_json(
                             record.get("dateAdded")),
                         last_modified=convert_date_from_json(
@@ -166,7 +166,7 @@ class storage_json(Walker):
                     bookmark.guid = record.get("guid")
                     bookmark.id = record["id"]
                     bookmark.index = record.get("index")
-                    bookmark.name = encode_title(record["title"])
+                    bookmark.name = record["title"]
                     self.current_folder.append(bookmark)
 
                 elif record["type"] == "text/x-moz-place-separator":
@@ -178,7 +178,7 @@ class storage_json(Walker):
                     ruler.index = record["index"]
                     ruler.last_modified = convert_date_from_json(
                         record.get("lastModified"))
-                    ruler.name = encode_title(record.get("title"))
+                    ruler.name = record.get("title")
                     ruler.comment = get_comment(record.get("annos"))
                     self.current_folder.append(ruler)
 
@@ -207,15 +207,9 @@ def convert_date_from_json(date):
     return date
 
 
-def encode_title(title):
-    if title:
-        return title.encode("UTF-8", "xmlcharrefreplace")
-    return title
-
-
 def get_str(record, name):
     if name in record:
-        return record[name].encode('utf-8')
+        return record[name]
     return ''
 
 
@@ -225,7 +219,7 @@ def get_comment(annos):
 
     for a in annos:
         if a["name"] == "bookmarkProperties/description":
-            return a["value"].encode('utf-8')
+            return a["value"]
 
     return ''
 
@@ -235,4 +229,5 @@ def make_annos(value, name="bookmarkProperties/description"):
         "expires": 4,
         "flags": 0,
         "name": name,
-        "value": value.decode('utf-8')}]
"value": value, + }] diff --git a/Writers/bkmk_wflad.py b/Writers/bkmk_wflad.py index 1adc343..cae46bb 100644 --- a/Writers/bkmk_wflad.py +++ b/Writers/bkmk_wflad.py @@ -64,8 +64,8 @@ Comment: %s""" % ( ): if hasattr(b, attr_name): value = getattr(b, attr_name) - if isinstance(value, unicode): - value = value.encode('utf-8') + #if isinstance(value, unicode): + # value = value.encode('utf-8') self.outfile.write("\n%s: %s" % (attr_out, value)) if hasattr(b, "last_tested"): diff --git a/bkmk_objects.py b/bkmk_objects.py index 4850261..b9dabff 100644 --- a/bkmk_objects.py +++ b/bkmk_objects.py @@ -85,7 +85,7 @@ class Bookmark(object): href += ':' + quote(password) href += '@' if host: - href += host.decode(parser_charset or 'utf-8').encode('idna') + href += host.encode('idna').decode('ascii') if port: href += ':%d' % port if path: @@ -230,8 +230,8 @@ def unquote_title(title): if BKMK_FORMAT == "MOZILLA": from HTMLParser import HTMLParser title = HTMLParser().unescape( - title.replace("&", '&').decode('utf-8')) - title = title.encode('utf-8').replace("'", "'") + title.replace("&", '&')) + title = title.replace("'", "'") return title diff --git a/bkmk_parser.py b/bkmk_parser.py index 0cd8f1f..997728f 100644 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -47,9 +47,9 @@ class BkmkParser(HTMLParser): def handle_data(self, data): if data: - if self.charset and default_encoding: - data = data.decode(self.charset, "replace").\ - encode(default_encoding, "xmlcharrefreplace") + #if self.charset and default_encoding: + # data = data.decode(self.charset, "replace").\ + # encode(default_encoding, "xmlcharrefreplace") self.accumulator += data # Mozilla - get charset diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 2e7df1a..69d9035 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -78,7 +78,7 @@ def recode_entities(title, charset): entity_re.match(part): _part = name2codepoint.get(part[1:-1], None) if _part is not None: - part = unichr(_part).encode(charset) + part = unichr(_part) output.append(part) title = ''.join(output) @@ -86,7 +86,7 @@ def recode_entities(title, charset): for part in num_entity_re.split(title): if num_entity_re.match(part): try: - part = unichr(int(part[2:-1])).encode(charset) + part = unichr(int(part[2:-1])) except UnicodeEncodeError: pass # Leave the entity as is output.append(part) @@ -146,23 +146,23 @@ def parse_html(html_text, charset=None, log=None): p, parser = _parsers[0] if log: log(" Using %s" % p.__module__) - title = parser.title - if isinstance(title, unicode): - if parser.charset: - parser.title = title.encode(parser.charset) - else: - try: - parser.title = title.encode('ascii') - except UnicodeEncodeError: - try: - parser.title = title.encode(DEFAULT_CHARSET) - except UnicodeEncodeError: - parser.title = title.encode(universal_charset) - parser.charset = universal_charset - else: - parser.charset = DEFAULT_CHARSET - else: - parser.charset = 'ascii' + #title = parser.title + #if isinstance(title, unicode): + # if parser.charset: + # parser.title = title.encode(parser.charset) + # else: + # try: + # parser.title = title.encode('ascii') + # except UnicodeEncodeError: + # try: + # parser.title = title.encode(DEFAULT_CHARSET) + # except UnicodeEncodeError: + # parser.title = title.encode(universal_charset) + # parser.charset = universal_charset + # else: + # parser.charset = DEFAULT_CHARSET + # else: + # parser.charset = 'ascii' converted_title = title = parser.title if title and (not parser.charset): @@ -184,21 
@@ -184,21 +184,21 @@ def parse_html(html_text, charset=None, log=None):
             if log: log(" guessed charset: %s" % parser.charset)
             # if log: log(" current charset: %s" % universal_charset)
             if log: log(" title : %s" % title)
-            if parser.charset != universal_charset:
-                try:
-                    converted_title = title.decode(parser.charset).\
-                        encode(universal_charset)
-                except UnicodeError:
-                    if log:
-                        log(" incorrect conversion from %s,"
-                            "converting from %s"
-                            % (parser.charset, DEFAULT_CHARSET))
-                    converted_title = \
-                        title.decode(DEFAULT_CHARSET, "replace").\
-                        encode(universal_charset, "replace")
-                    parser.charset = DEFAULT_CHARSET
-            if log and (converted_title != title):
-                log(" converted title: %s" % converted_title)
+            #if parser.charset != universal_charset:
+            #    try:
+            #        converted_title = title.decode(parser.charset).\
+            #            encode(universal_charset)
+            #    except UnicodeError:
+            #        if log:
+            #            log(" incorrect conversion from %s,"
+            #                "converting from %s"
+            #                % (parser.charset, DEFAULT_CHARSET))
+            #        converted_title = \
+            #            title.decode(DEFAULT_CHARSET, "replace").\
+            #            encode(universal_charset, "replace")
+            #        parser.charset = DEFAULT_CHARSET
+            #if log and (converted_title != title):
+            #    log(" converted title: %s" % converted_title)
         except LookupError:
             if log: log(" unknown charset: '%s'" % parser.charset)
     else:
@@ -212,13 +212,13 @@ def parse_html(html_text, charset=None, log=None):
             log(" final title : %s" % final_title)
     parser.title = final_title
 
-    icon = parser.icon
-    if isinstance(icon, unicode):
-        try:
-            parser.icon = icon.encode('ascii')
-        except UnicodeEncodeError:
-            if parser.charset:
-                parser.icon = icon.encode(parser.charset)
+    #icon = parser.icon
+    #if isinstance(icon, unicode):
+    #    try:
+    #        parser.icon = icon.encode('ascii')
+    #    except UnicodeEncodeError:
+    #        if parser.charset:
+    #            parser.icon = icon.encode(parser.charset)
 
     return parser
 
diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py
index 94a572b..ac880cc 100644
--- a/parse_html/bkmk_ph_beautifulsoup.py
+++ b/parse_html/bkmk_ph_beautifulsoup.py
@@ -125,8 +125,8 @@ def parse_html(html_text, charset=None, log=None):
         if meta_content:
             meta_charset = _charset = meta_content.lower()
 
-    if title and (_charset or meta_charset):
-        title = title.encode(_charset or meta_charset)
+    #if title and (_charset or meta_charset):
+    #    title = title.encode(_charset or meta_charset)
 
     meta = head.find(_find_refresh, recursive=False)
     if meta:
diff --git a/parse_html/bkmk_ph_beautifulsoup4.py b/parse_html/bkmk_ph_beautifulsoup4.py
index fbd54ff..1095ebc 100644
--- a/parse_html/bkmk_ph_beautifulsoup4.py
+++ b/parse_html/bkmk_ph_beautifulsoup4.py
@@ -84,12 +84,12 @@ def parse_html(html_text, charset=None, log=None):
         if meta_content:
             meta_charset = _charset = meta_content.lower()
 
-    if title and (_charset or meta_charset):
-        try:
-            title = title.encode(_charset or meta_charset)
-        except LookupError:
-            title = title.encode(universal_charset)
-            _charset = universal_charset
+    #if title and (_charset or meta_charset):
+    #    try:
+    #        title = title.encode(_charset or meta_charset)
+    #    except LookupError:
+    #        title = title.encode(universal_charset)
+    #        _charset = universal_charset
 
     meta = head.find(_find_refresh, recursive=False)
     if meta:
diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py
index 09aa2a3..95f2071 100644
--- a/parse_html/bkmk_ph_etreetidy.py
+++ b/parse_html/bkmk_ph_etreetidy.py
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
""" __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2017 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -48,8 +48,8 @@ def parse_html(html_text, charset=None, log=None): else: meta_charset = False - if title and (charset or meta_charset): - title = title.encode(charset or meta_charset) + #if title and (charset or meta_charset): + # title = title.encode(charset or meta_charset) for m in meta: if m.get('http-equiv', '').lower() == 'refresh': diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py index 111e1ed..68c1aba 100644 --- a/parse_html/bkmk_ph_html5.py +++ b/parse_html/bkmk_ph_html5.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -73,8 +73,8 @@ def parse_html(html_text, charset=None, log=None): if not charset: charset = parser.tokenizer.stream.charEncoding[0] - if title and (charset or meta_charset): - title = title.encode(charset or meta_charset) + #if title and (charset or meta_charset): + # title = title.encode(charset or meta_charset) for node in head.childNodes: if node.name == 'meta' and \ diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index 1fa4791..03dd6f4 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2017 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -42,8 +42,8 @@ def parse_html(html_text, charset=None, log=None): else: meta_charset = False - if title and (charset or meta_charset): - title = title.encode(charset or meta_charset) + #if title and (charset or meta_charset): + # title = title.encode(charset or meta_charset) for m in meta: if m.get('http-equiv', '').lower() == 'refresh': -- 2.39.2