git.phdru.name Git - bookmarks_db.git/commitdiff
Fix(Py3): Stop encoding unicode to bytes
author Oleg Broytman <phd@phdru.name>
Sat, 11 Nov 2023 18:35:26 +0000 (21:35 +0300)
committer Oleg Broytman <phd@phdru.name>
Sun, 12 Nov 2023 19:21:09 +0000 (22:21 +0300)
Storage/bkmk_stjson.py
Writers/bkmk_wflad.py
bkmk_objects.py
bkmk_parser.py
parse_html/bkmk_parse_html.py
parse_html/bkmk_ph_beautifulsoup.py
parse_html/bkmk_ph_beautifulsoup4.py
parse_html/bkmk_ph_etreetidy.py
parse_html/bkmk_ph_html5.py
parse_html/bkmk_ph_lxml.py

index 1d34cbddb68df94703690d696a14d47b4c3c94ae..9f7fed78202d319658aa199047069f979a8c18e0 100644 (file)
@@ -41,7 +41,7 @@ class storage_json(Walker):
         dict["lastModified"] = convert_date_to_json(f.last_modified)
         root = getattr(f, 'root')
         if root: dict["root"] = root
-        dict["title"] = f.name.decode('utf-8')
+        dict["title"] = f.name
         dict["type"] = "text/x-moz-place-container"
         if root:
             self.dict["children"].append(dict)
@@ -69,7 +69,7 @@ class storage_json(Walker):
         keyword = getattr(b, 'keyword')
         if keyword: dict["keyword"] = keyword
         dict["lastModified"] = convert_date_to_json(b.last_modified)
-        dict["title"] = b.name.decode('utf-8')
+        dict["title"] = b.name
         dict["type"] = "text/x-moz-place"
         dict["uri"] = b.href
         self.folder_stack[-1].append(dict)
@@ -84,7 +84,7 @@ class storage_json(Walker):
         if guid: dict["guid"] = guid
         dict["index"] = r.index
         dict["lastModified"] = convert_date_to_json(r.last_modified)
-        if r.name: dict["title"] = r.name.decode('utf-8')
+        if r.name: dict["title"] = r.name
         dict["type"] = "text/x-moz-place-separator"
         self.folder_stack[-1].append(dict)
 
@@ -135,7 +135,7 @@ class storage_json(Walker):
         folder.guid = fdict.get("guid")
         folder.index = fdict.get("index")
         folder.root = fdict.get("root")
-        folder.name = encode_title(fdict["title"])
+        folder.name = fdict["title"]
 
         if "children" in fdict:
             for record in fdict["children"]:
@@ -154,7 +154,7 @@ class storage_json(Walker):
 
                 elif record["type"] == "text/x-moz-place":
                     bookmark = Bookmark(
-                        href=record["uri"].encode('utf-8'),
+                        href=record["uri"],
                         add_date=convert_date_from_json(
                             record.get("dateAdded")),
                         last_modified=convert_date_from_json(
@@ -166,7 +166,7 @@ class storage_json(Walker):
                     bookmark.guid = record.get("guid")
                     bookmark.id = record["id"]
                     bookmark.index = record.get("index")
-                    bookmark.name = encode_title(record["title"])
+                    bookmark.name = record["title"]
                     self.current_folder.append(bookmark)
 
                 elif record["type"] == "text/x-moz-place-separator":
@@ -178,7 +178,7 @@ class storage_json(Walker):
                     ruler.index = record["index"]
                     ruler.last_modified = convert_date_from_json(
                         record.get("lastModified"))
-                    ruler.name = encode_title(record.get("title"))
+                    ruler.name = record.get("title")
                     ruler.comment = get_comment(record.get("annos"))
                     self.current_folder.append(ruler)
 
@@ -207,15 +207,9 @@ def convert_date_from_json(date):
     return date
 
 
-def encode_title(title):
-    if title:
-        return title.encode("UTF-8", "xmlcharrefreplace")
-    return title
-
-
 def get_str(record, name):
     if name in record:
-        return record[name].encode('utf-8')
+        return record[name]
     return ''
 
 
@@ -225,7 +219,7 @@ def get_comment(annos):
 
     for a in annos:
         if a["name"] == "bookmarkProperties/description":
-            return a["value"].encode('utf-8')
+            return a["value"]
 
     return ''
 
@@ -235,4 +229,5 @@ def make_annos(value, name="bookmarkProperties/description"):
         "expires": 4,
         "flags": 0,
         "name": name,
-        "value": value.decode('utf-8')}]
+        "value": value,
+    }]
index 1adc34310b9c74be278c5ec6addb573243f8fa32..cae46bb44554c1e79a54a32bd1d1ac72e80ab307 100644 (file)
@@ -64,8 +64,8 @@ Comment: %s""" % (
         ):
             if hasattr(b, attr_name):
                 value = getattr(b, attr_name)
-                if isinstance(value, unicode):
-                    value = value.encode('utf-8')
+                #if isinstance(value, unicode):
+                #    value = value.encode('utf-8')
                 self.outfile.write("\n%s: %s" % (attr_out, value))
 
         if hasattr(b, "last_tested"):
index 4850261ef529c854708b5eb0437323720ed9d41e..b9dabff1e316a66c36d14a9f3d0911f4b60f6b2c 100644 (file)
@@ -85,7 +85,7 @@ class Bookmark(object):
                 href += ':' + quote(password)
             href += '@'
         if host:
-            href += host.decode(parser_charset or 'utf-8').encode('idna')
+            href += host.encode('idna').decode('ascii')
             if port:
                 href += ':%d' % port
         if path:
@@ -230,8 +230,8 @@ def unquote_title(title):
     if BKMK_FORMAT == "MOZILLA":
         from HTMLParser import HTMLParser
         title = HTMLParser().unescape(
-            title.replace("&amp;", '&').decode('utf-8'))
-        title = title.encode('utf-8').replace("&#39;", "'")
+            title.replace("&amp;", '&'))
+        title = title.replace("&#39;", "'")
     return title
 
 
index 0cd8f1fe711a7fd00abe8feaecffb89d79a8d303..997728f3b9ceba4072be9c7c872d02fda8ca763b 100644 (file)
@@ -47,9 +47,9 @@ class BkmkParser(HTMLParser):
 
     def handle_data(self, data):
         if data:
-            if self.charset and default_encoding:
-                data = data.decode(self.charset, "replace").\
-                    encode(default_encoding, "xmlcharrefreplace")
+            #if self.charset and default_encoding:
+            #    data = data.decode(self.charset, "replace").\
+            #        encode(default_encoding, "xmlcharrefreplace")
             self.accumulator += data
 
     # Mozilla - get charset
index 2e7df1a2b90fde750d0d47aafcf073a11c60c3a5..69d9035f5001759cb7689221775572b3e49b1373 100644 (file)
@@ -78,7 +78,7 @@ def recode_entities(title, charset):
               entity_re.match(part):
             _part = name2codepoint.get(part[1:-1], None)
             if _part is not None:
-                part = unichr(_part).encode(charset)
+                part = unichr(_part)
         output.append(part)
     title = ''.join(output)
 
@@ -86,7 +86,7 @@ def recode_entities(title, charset):
     for part in num_entity_re.split(title):
         if num_entity_re.match(part):
             try:
-                part = unichr(int(part[2:-1])).encode(charset)
+                part = unichr(int(part[2:-1]))
             except UnicodeEncodeError:
                 pass  # Leave the entity as is
         output.append(part)
@@ -146,23 +146,23 @@ def parse_html(html_text, charset=None, log=None):
         p, parser = _parsers[0]
     if log: log("   Using %s" % p.__module__)
 
-    title = parser.title
-    if isinstance(title, unicode):
-        if parser.charset:
-            parser.title = title.encode(parser.charset)
-        else:
-            try:
-                parser.title = title.encode('ascii')
-            except UnicodeEncodeError:
-                try:
-                    parser.title = title.encode(DEFAULT_CHARSET)
-                except UnicodeEncodeError:
-                    parser.title = title.encode(universal_charset)
-                    parser.charset = universal_charset
-                else:
-                    parser.charset = DEFAULT_CHARSET
-            else:
-                parser.charset = 'ascii'
+    #title = parser.title
+    #if isinstance(title, unicode):
+    #    if parser.charset:
+    #        parser.title = title.encode(parser.charset)
+    #    else:
+    #        try:
+    #            parser.title = title.encode('ascii')
+    #        except UnicodeEncodeError:
+    #            try:
+    #                parser.title = title.encode(DEFAULT_CHARSET)
+    #            except UnicodeEncodeError:
+    #                parser.title = title.encode(universal_charset)
+    #                parser.charset = universal_charset
+    #            else:
+    #                parser.charset = DEFAULT_CHARSET
+    #        else:
+    #            parser.charset = 'ascii'
 
     converted_title = title = parser.title
     if title and (not parser.charset):
@@ -184,21 +184,21 @@ def parse_html(html_text, charset=None, log=None):
                 if log: log("   guessed charset: %s" % parser.charset)
             # if log: log("   current charset: %s" % universal_charset)
             if log: log("   title          : %s" % title)
-            if parser.charset != universal_charset:
-                try:
-                    converted_title = title.decode(parser.charset).\
-                        encode(universal_charset)
-                except UnicodeError:
-                    if log:
-                        log("   incorrect conversion from %s,"
-                            "converting from %s"
-                            % (parser.charset, DEFAULT_CHARSET))
-                    converted_title = \
-                        title.decode(DEFAULT_CHARSET, "replace").\
-                        encode(universal_charset, "replace")
-                    parser.charset = DEFAULT_CHARSET
-            if log and (converted_title != title):
-                log("   converted title: %s" % converted_title)
+            #if parser.charset != universal_charset:
+            #    try:
+            #        converted_title = title.decode(parser.charset).\
+            #            encode(universal_charset)
+            #    except UnicodeError:
+            #        if log:
+            #            log("   incorrect conversion from %s,"
+            #                "converting from %s"
+            #                % (parser.charset, DEFAULT_CHARSET))
+            #        converted_title = \
+            #            title.decode(DEFAULT_CHARSET, "replace").\
+            #            encode(universal_charset, "replace")
+            #        parser.charset = DEFAULT_CHARSET
+            #if log and (converted_title != title):
+            #    log("   converted title: %s" % converted_title)
         except LookupError:
             if log: log("   unknown charset: '%s'" % parser.charset)
     else:
@@ -212,13 +212,13 @@ def parse_html(html_text, charset=None, log=None):
             log("   final title    : %s" % final_title)
         parser.title = final_title
 
-    icon = parser.icon
-    if isinstance(icon, unicode):
-        try:
-            parser.icon = icon.encode('ascii')
-        except UnicodeEncodeError:
-            if parser.charset:
-                parser.icon = icon.encode(parser.charset)
+    #icon = parser.icon
+    #if isinstance(icon, unicode):
+    #    try:
+    #        parser.icon = icon.encode('ascii')
+    #    except UnicodeEncodeError:
+    #        if parser.charset:
+    #            parser.icon = icon.encode(parser.charset)
     return parser
 
 
index 94a572bb48e0153fbb93f403ccfb46abe48566f2..ac880cc6b7c197eab772d056e2d210eb5fabc520 100644 (file)
@@ -125,8 +125,8 @@ def parse_html(html_text, charset=None, log=None):
             if meta_content:
                 meta_charset = _charset = meta_content.lower()
 
-    if title and (_charset or meta_charset):
-        title = title.encode(_charset or meta_charset)
+    #if title and (_charset or meta_charset):
+    #    title = title.encode(_charset or meta_charset)
 
     meta = head.find(_find_refresh, recursive=False)
     if meta:
index fbd54ff9cfd207c583d5119ba06c20faf23f9d4d..1095ebce8d510c22df2c9768baed3c27777e1909 100644 (file)
@@ -84,12 +84,12 @@ def parse_html(html_text, charset=None, log=None):
             if meta_content:
                 meta_charset = _charset = meta_content.lower()
 
-    if title and (_charset or meta_charset):
-        try:
-            title = title.encode(_charset or meta_charset)
-        except LookupError:
-            title = title.encode(universal_charset)
-            _charset = universal_charset
+    #if title and (_charset or meta_charset):
+    #    try:
+    #        title = title.encode(_charset or meta_charset)
+    #    except LookupError:
+    #        title = title.encode(universal_charset)
+    #        _charset = universal_charset
 
     meta = head.find(_find_refresh, recursive=False)
     if meta:
index 09aa2a3773642cf08df73e100206d782602e8685..95f2071bb87a3018a3f616d136c79291d02233eb 100644 (file)
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2017 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -48,8 +48,8 @@ def parse_html(html_text, charset=None, log=None):
     else:
         meta_charset = False
 
-    if title and (charset or meta_charset):
-        title = title.encode(charset or meta_charset)
+    #if title and (charset or meta_charset):
+    #    title = title.encode(charset or meta_charset)
 
     for m in meta:
         if m.get('http-equiv', '').lower() == 'refresh':
index 111e1ed4b4b38360fef84ca41a9276351d0e6c5e..68c1ababbb106f33a14d08e3c51d9e8d19e17573 100644 (file)
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -73,8 +73,8 @@ def parse_html(html_text, charset=None, log=None):
         if not charset:
             charset = parser.tokenizer.stream.charEncoding[0]
 
-        if title and (charset or meta_charset):
-            title = title.encode(charset or meta_charset)
+        #if title and (charset or meta_charset):
+        #    title = title.encode(charset or meta_charset)
 
         for node in head.childNodes:
             if node.name == 'meta' and \
index 1fa47917deaf81fb6b40dbb413498e45dec7eadc..03dd6f4c0d90bb60627a442d1dfb743eb7a07961 100644 (file)
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2017 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -42,8 +42,8 @@ def parse_html(html_text, charset=None, log=None):
     else:
         meta_charset = False
 
-    if title and (charset or meta_charset):
-        title = title.encode(charset or meta_charset)
+    #if title and (charset or meta_charset):
+    #    title = title.encode(charset or meta_charset)
 
     for m in meta:
         if m.get('http-equiv', '').lower() == 'refresh':