From 7a94766db000e620c3159c573594caf4cf2c97ae Mon Sep 17 00:00:00 2001
From: Oleg Broytman <phd@phdru.name>
Date: Thu, 24 Oct 2024 16:47:26 +0300
Subject: [PATCH] bin: Debug and improve filename re-encoding utilities

---
 bin/cleanup-recode.sh             |  23 ++---
 bin/cp_recode_fname               |  42 +++-----
 bin/get_encodings                 |  64 ++++++++++++
 bin/recode-filenames-recursive.py |  74 +++++---------
 bin/recode_filenames.py           | 163 ++++++++++++++++++++----------
 bin/unicode_norm_nfd.py           |   7 +-
 6 files changed, 231 insertions(+), 142 deletions(-)
 create mode 100755 bin/get_encodings

diff --git a/bin/cleanup-recode.sh b/bin/cleanup-recode.sh
index 3f8213d..4bf86d3 100755
--- a/bin/cleanup-recode.sh
+++ b/bin/cleanup-recode.sh
@@ -1,22 +1,15 @@
 #! /bin/sh
+set -e  # abort as soon as any command fails
 
 if ! echo "$LC_CTYPE" | grep -Fiq utf-8; then
-   echo "This script requires UTF-8 locale" 1>&2
-   exit 1
+    echo "This script requires UTF-8 locale" 1>&2
+    exit 1
 fi
 
-from_enc=utf-8
-to_enc=koi8-r
+. get_encodings  # sourced via PATH: sets from_encoding/to_encoding and shifts the options off "$@"
 
-while getopts f:t: opt; do
-   case $opt in
-      f ) from_enc="$OPTARG" ;;
-      t ) to_enc="$OPTARG" ;;
-   esac
-done
-shift `expr $OPTIND - 1`
-
-for dir in "${@:-.}"; do
-   cleanup-filenames-recursive.sh "$dir" &&
-   recode-filenames-recursive.py "$from_enc" "$to_enc" "$dir" || exit 1
+for dir in "${@:-.}"; do  # options were shifted off by get_encodings; "$@" keeps each path a separate word
+    cleanup-filenames-recursive.sh "$dir" &&
+    recode-filenames-recursive.py -f "$from_encoding" -t "$to_encoding" \
+        "$dir" || exit 1
 done
diff --git a/bin/cp_recode_fname b/bin/cp_recode_fname
index d6dd74b..13fe8c2 100755
--- a/bin/cp_recode_fname
+++ b/bin/cp_recode_fname
@@ -1,34 +1,9 @@
 #! /bin/sh
 set -e
 
-usage() {
-   echo "Usage: $0 [[from_enc] to_enc] filename" >&2
-   exit 1
-}
-
-if [ $# -eq 1 ]; then
-    from_enc="`python3 -c \"from m_lib.defenc import default_encoding; print(default_encoding)"`"
-    if [ "$from_enc" != utf-8 ]; then
-        to_enc=utf-8
-    else
-        usage
-    fi
-    filename="$1"
-elif [ $# -eq 2 ]; then
-    from_enc="`python3 -c \"from m_lib.defenc import default_encoding; print(default_encoding)"`"
-    to_enc="$1"
-    filename="$2"
-elif [ $# -eq 3 ]; then
-    from_enc="$1"
-    to_enc="$2"
-    filename="$3"
-else
-    usage
-fi
-
 cmd="`basename \"$0\"`"
 case "$cmd" in
-    cp_*) cmd="cp -p" ;;
+    cp_*) cmd="cp -ap" ;;
     mv_*) cmd=mv ;;
     *)
         echo "Unknown command $0, aborting" >&2
@@ -36,5 +11,16 @@ case "$cmd" in
     ;;
 esac
 
-filename_recoded=`echo "$filename" | iconv -f "$from_enc" -t "$to_enc"`
-exec $cmd "$filename" "$filename_recoded"
+. get_encodings  # sets from_encoding/to_encoding; leaves only the paths in "$@"
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 [-f from_encoding] [-t to_encoding] path [path ...]" >&2
+    exit 1
+fi
+
+for filename in "$@"; do  # "$filenames" is one joined string; "$@" keeps each path separate
+    filename_recoded=`printf '%s\n' "$filename" |
+        iconv -f "$from_encoding" -t "$to_encoding"`
+    if [ "$filename_recoded" != "$filename" ]; then
+        $cmd "$filename" "$filename_recoded"
+    fi
+done
diff --git a/bin/get_encodings b/bin/get_encodings
new file mode 100755
index 0000000..7327926
--- /dev/null
+++ b/bin/get_encodings
@@ -0,0 +1,64 @@
+#! /bin/sh
+
+case "$0" in
+    get_encodings|*/get_encodings)  # executed directly; also matches a slash-less $0 (sh get_encodings)
+        sourced=false
+        ;;
+    *)
+        sourced=true
+        ;;
+esac
+
+from_encoding=  # empty means "not given on the command line"
+to_encoding=
+default_encoding="`python3 -c \"from m_lib.defenc import default_encoding; print(default_encoding)"`"
+
+while getopts f:t: opt; do  # -f / -t override the empty defaults above
+   case $opt in
+      f ) from_encoding="$OPTARG" ;;
+      t ) to_encoding="$OPTARG" ;;
+   esac
+done
+shift `expr $OPTIND - 1`  # drop the parsed options; when sourced this shifts the caller's "$@" as well
+filenames="$@"  # NOTE: collapses the remaining arguments into a single string
+
+error() {  # print the given message and a usage line to stderr, then exit with status 1
+    echo "$@" >&2
+    echo "Usage: $0 [-f from_encoding] [-t to_encoding] [path [path ...]]" >&2
+    exit 1  # when this file is sourced, this ends the sourcing script too
+}
+
+if [ -n "$from_encoding" ]; then
+    if [ -n "$to_encoding" ]; then
+        : # Everything is defined, no need to guess
+    elif [ "$from_encoding" = utf-8 ]; then  # only -f utf-8: target is the default encoding
+        if [ "$default_encoding" = utf-8 ]; then
+            error Cannot guess to_encoding
+        else
+            to_encoding="$default_encoding"
+        fi
+    else  # only -f <non-utf-8>: convert to utf-8
+        to_encoding=utf-8
+    fi
+elif [ -n "$to_encoding" ]; then  # only -t given
+    if [ "$to_encoding" = "$default_encoding" ]; then
+        if [ "$default_encoding" = utf-8 ]; then
+            error Cannot guess from_encoding
+        else
+            from_encoding=utf-8
+        fi
+    else
+        from_encoding="$default_encoding"
+    fi
+else  # neither option given: default encoding -> utf-8, unless they coincide
+    if [ "$default_encoding" = utf-8 ]; then
+        error Cannot guess encodings
+    else
+        from_encoding="$default_encoding"
+        to_encoding=utf-8
+    fi
+fi
+
+if [ "$sourced" = false ]; then  # executed directly: report the result on stdout
+    echo "$from_encoding $to_encoding"
+fi
diff --git a/bin/recode-filenames-recursive.py b/bin/recode-filenames-recursive.py
index 5408563..1d73a92 100755
--- a/bin/recode-filenames-recursive.py
+++ b/bin/recode-filenames-recursive.py
@@ -1,57 +1,39 @@
 #! /usr/bin/env python3
 
-import sys, os
+import os
 
-from m_lib.defenc import default_encoding
+from recode_filenames import parse_args, build_recode
 
-if len(sys.argv) == 1:
-    src_encoding = default_encoding
-    if src_encoding == 'utf-8':
-        sys.exit("Usage: %s [[src_enc] dst_enc [start_dir]]" % sys.argv[0])
-    else:
-        dst_encoding = 'utf-8'
-    start_dir = '.'
-elif len(sys.argv) == 2:
-    src_encoding = default_encoding
-    dst_encoding = sys.argv[1]
-    start_dir = '.'
-elif len(sys.argv) == 3:
-    src_encoding = default_encoding
-    dst_encoding = sys.argv[1]
-    start_dir = sys.argv[2]
-elif len(sys.argv) == 4:
-    src_encoding = sys.argv[1]
-    dst_encoding = sys.argv[2]
-    start_dir = sys.argv[3]
-else:
-    sys.exit("Usage: %s [[src_enc] dst_enc [start_dir]]" % sys.argv[0])
-
-# Fake for recode_filenames.py
-sys.argv = ['', src_encoding, dst_encoding]
-from recode_filenames import _recode
+from_encoding, to_encoding, dirnames = parse_args(default='.')  # no paths given -> ['.']
+_recode = build_recode(from_encoding, to_encoding)  # callable: name -> recoded name
 
 
 def _onerror(exc):
     raise exc
 
-plist = list(os.walk(start_dir, topdown=False, onerror=_onerror))
-
 
 save_dir = os.getcwd()
-for dirname, _subdirs, fnames in plist:
-    if dirname == '.':
-        continue
-    os.chdir(dirname)
-    for filename in fnames:
-        # if not exists - it was renamed already
-        if os.path.exists(filename) and \
-                os.path.isfile(filename):
-            newname = _recode(filename)
-            if newname != filename:
-                os.rename(filename, newname)
-    os.chdir('..')
-    dirname = os.path.basename(dirname)
-    newname = _recode(dirname)
-    if newname != dirname:
-        os.rename(dirname, newname)
-    os.chdir(save_dir)
+for start_dir in dirnames:
+    for dirname, _subdirs, fnames in list(  # materialize the walk before anything is renamed
+        os.walk(start_dir, topdown=False, onerror=_onerror)
+    ):
+        if dirname != '.':
+            os.chdir(dirname)
+        for filename in fnames:
+            # if not exists - it was renamed already
+            if os.path.exists(filename) and \
+                    os.path.isfile(filename):
+                newname = _recode(filename)
+                if not isinstance(newname, str):
+                    newname = newname.decode()
+                if newname != filename:
+                    os.rename(filename, newname)
+        if dirname != '.':
+            os.chdir('..')
+            dirname = os.path.basename(dirname.rstrip(os.sep))  # 'dir/' must yield 'dir', not ''
+            newname = _recode(dirname)
+            if not isinstance(newname, str):
+                newname = newname.decode()
+            if newname != dirname:
+                os.rename(dirname, newname)
+        os.chdir(save_dir)  # every walked path is relative to the starting directory
diff --git a/bin/recode_filenames.py b/bin/recode_filenames.py
index b8d3fa0..bd10bf5 100755
--- a/bin/recode_filenames.py
+++ b/bin/recode_filenames.py
@@ -1,69 +1,128 @@
 #! /usr/bin/env python3
 # -*- coding: koi8-r -*-
 
+import argparse
 import sys
 
-src_encoding = sys.argv[1]
-dst_encoding = sys.argv[2]
+from m_lib.defenc import default_encoding
 
-if src_encoding == "translit":
-    if dst_encoding == "koi8-r":
-        from m_lib.rus.lat2rus import lat2koi as _recode
-    elif dst_encoding == "cp1251":
-        from m_lib.rus.lat2rus import lat2win as _recode
+
+def parse_args(default=None):  # -> (from_encoding, to_encoding, list of paths)
+    parser = argparse.ArgumentParser(description='Recode filenames')
+    parser.add_argument('-f', '--from-encoding', help='from encoding')
+    parser.add_argument('-t', '--to-encoding', help='to encoding')
+    parser.add_argument('filename', nargs='*' if default else '+',  # paths are optional only with a default
+                        default=[default], help='filenames to recode')
+    args = parser.parse_args()
+
+    from_encoding = args.from_encoding
+    to_encoding = args.to_encoding
+
+    if from_encoding:
+        if to_encoding:
+            pass  # Everything is defined, no need to guess
+        elif from_encoding == 'utf-8':  # only -f utf-8: target is the default encoding
+            if default_encoding == 'utf-8':
+                sys.exit('Cannot guess to_encoding')
+            else:
+                to_encoding = default_encoding
+        else:  # only -f <non-utf-8>: convert to utf-8
+            to_encoding = 'utf-8'
+    elif to_encoding:  # only -t given
+        if to_encoding == default_encoding:
+            if default_encoding == 'utf-8':
+                sys.exit('Cannot guess from_encoding')
+            else:
+                from_encoding = 'utf-8'
+        else:
+            from_encoding = default_encoding
     else:
-        raise NotImplementedError("destination encoding must be koi8-r or cp1251, not `%s'" % dst_encoding)
+        if default_encoding == 'utf-8':  # neither option given and nothing to convert between
+            sys.exit('Cannot guess encodings')
+        else:
+            from_encoding = default_encoding
+            to_encoding = 'utf-8'
+
+    return from_encoding, to_encoding, args.filename
+
+
+def build_recode(from_encoding, to_encoding):  # returns a one-argument callable that recodes a name
+    if from_encoding == "translit":
+        if to_encoding == "koi8-r":
+            from m_lib.rus.lat2rus import lat2koi as _recode
+        elif to_encoding == "cp1251":
+            from m_lib.rus.lat2rus import lat2win as _recode
+        else:
+            raise NotImplementedError(
+                "destination encoding must be koi8-r or cp1251, "
+                "not `%s'" % to_encoding)
+
+    elif to_encoding == "translit":
+        if from_encoding == "koi8-r":
+            from m_lib.rus.rus2lat import koi2lat as _recode
+        elif from_encoding == "cp1251":
+            from m_lib.rus.rus2lat import win2lat as _recode
+        else:
+            raise NotImplementedError(
+                "source encoding must be koi8-r or cp1251, "
+                "not `%s'" % from_encoding)
+
+        from m_lib.rus.rus2lat import koi2lat_d
+        koi2lat_d["ÿ"] = ''  # remove apostrophs -
+        koi2lat_d["ø"] = ''  # they are not very good characters in filenames
+        koi2lat_d["ß"] = ''  # especially on Windoze
+        koi2lat_d["Ø"] = ''  # :-)
+
+    elif from_encoding == "url":  # unquote; to_encoding may be "src/dst" to request a recode afterwards
+        try:
+            from_encoding, to_encoding = to_encoding.split('/')
+        except ValueError:
+            from_encoding = to_encoding
+        from urllib.parse import unquote
+        from m_lib.opstring import recode
+
+        def _recode(s):
+            s = unquote(s)
+            if from_encoding != to_encoding:
+                s = recode(s, from_encoding, to_encoding, "replace")
+            return s
+
+    elif to_encoding == "url":  # from_encoding may be "src/dst" to request a recode before quoting
+        try:
+            from_encoding, to_encoding = from_encoding.split('/')
+        except ValueError:
+            to_encoding = from_encoding
+        from urllib.parse import quote
+        from m_lib.opstring import recode
+
+        def _recode(s):
+            if from_encoding != to_encoding:
+                s = recode(s, from_encoding, to_encoding, "replace")
+            #                wget treats them as safe
+            #                     vvvvvvvvvvvvv
+            return quote(s, safe=";/?:@&=+$,()'")
 
-elif dst_encoding == "translit":
-    if src_encoding == "koi8-r":
-        from m_lib.rus.rus2lat import koi2lat as _recode
-    elif src_encoding == "cp1251":
-        from m_lib.rus.rus2lat import win2lat as _recode
     else:
-        raise NotImplementedError("source encoding must be koi8-r or cp1251, not `%s'" % src_encoding)
-
-    from m_lib.rus.rus2lat import koi2lat_d
-    koi2lat_d["ÿ"] = '' # remove apostrophs -
-    koi2lat_d["ø"] = '' # they are not very good characters in filenames
-    koi2lat_d["ß"] = '' # especially on Windoze
-    koi2lat_d["Ø"] = '' # :-)
-
-elif src_encoding == "url":
-    try:
-        src_encoding, dst_encoding = dst_encoding.split('/')
-    except ValueError:
-        src_encoding = dst_encoding
-    from m_lib.opstring import recode
-    import urllib
-    def _recode(s):
-        s = urllib.unquote(s)
-        if src_encoding != dst_encoding:
-            s = recode(s, src_encoding, dst_encoding, "replace")
-        return s
-
-elif dst_encoding == "url":
-    try:
-        src_encoding, dst_encoding = src_encoding.split('/')
-    except ValueError:
-        dst_encoding = src_encoding
-    from m_lib.opstring import recode
-    import urllib
-    def _recode(s):
-        if src_encoding != dst_encoding:
-            s = recode(s, src_encoding, dst_encoding, "replace")
-        return urllib.quote(s, safe=";/?:@&=+$,()'") # wget treats them as safe
-
-else:
-    from m_lib.opstring import recode
-    def _recode(s):
-        return recode(s, src_encoding, dst_encoding, "replace")
+        def _recode(s):  # NOTE(review): encodes with to_encoding, then decodes with from_encoding - presumably compensates for how the OS-decoded name round-trips; confirm
+            return s.encode(to_encoding, "surrogateescape").\
+                decode(from_encoding, "surrogateescape")
+
+    return _recode
 
 
 if __name__ == "__main__":
     import os
-    for filename in sys.argv[3:]:
+    from_encoding, to_encoding, filenames = parse_args()
+    _recode = build_recode(from_encoding, to_encoding)
+    for filename in filenames:
         new_name = _recode(filename)
-        if type(filename) is not type(new_name):
+        if not isinstance(new_name, str):
             new_name = new_name.decode()
         if new_name != filename:
+            sys.stdout.buffer.write(  # bytes output: names may not be printable as text
+                b'%s %s / %s -> %s\n' % (
+                    from_encoding.encode(), to_encoding.encode(),
+                    filename.encode(default_encoding, 'replace'),
+                    new_name.encode(default_encoding, 'replace'))  # may hold lone surrogates; strict encode would raise
+                )
             os.rename(filename, new_name)
diff --git a/bin/unicode_norm_nfd.py b/bin/unicode_norm_nfd.py
index 2cca0d9..030de47 100755
--- a/bin/unicode_norm_nfd.py
+++ b/bin/unicode_norm_nfd.py
@@ -4,6 +4,8 @@
 import sys
 import unicodedata
 
+from m_lib.defenc import default_encoding
+
 
 def strip_accents(s):
     return ''.join(c for c in unicodedata.normalize('NFD', s)
@@ -20,4 +22,7 @@ if __name__ == '__main__':
     if len(sys.argv) == 1:
         sys.exit('Usage: %s name\n' % sys.argv[0])
     for name in sys.argv[1:]:
-        print(latin1_to_ascii(name))
+        sys.stdout.buffer.write(
+            latin1_to_ascii(name).encode(default_encoding, 'surrogateescape')
+            + b'\n'
+        )
-- 
2.39.5