From 7a94766db000e620c3159c573594caf4cf2c97ae Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Thu, 24 Oct 2024 16:47:26 +0300
Subject: [PATCH] bin: Debug and improve filename re-encoding utilities

Move the option parsing and encoding guessing shared by the shell
wrappers into a sourceable helper, bin/get_encodings, and give the
Python tools the same -f/-t command line through
recode_filenames.parse_args() and build_recode().

- cleanup-recode.sh and cp_recode_fname source get_encodings and then
  iterate over the remaining positional parameters, so every path is
  handled separately and paths containing whitespace stay intact.
- cp_recode_fname accepts several paths, copies with "cp -ap" and
  skips names that recoding does not change.
- recode-filenames-recursive.py accepts several start directories and
  also recodes the files that live directly in ".".
- recode_filenames.py no longer reads sys.argv at import time, uses
  urllib.parse instead of the Python 2 urllib functions and reports
  every rename on stdout.
- unicode_norm_nfd.py writes bytes to stdout so that names which
  cannot be encoded do not break the output.
---
Note: line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy; check the context lines against the
tree before applying.

 bin/cleanup-recode.sh             |  23 ++---
 bin/cp_recode_fname               |  42 +++-----
 bin/get_encodings                 |  64 ++++++++++++
 bin/recode-filenames-recursive.py |  74 +++++---------
 bin/recode_filenames.py           | 163 ++++++++++++++++++++----------
 bin/unicode_norm_nfd.py           |   7 +-
 6 files changed, 231 insertions(+), 142 deletions(-)
 create mode 100755 bin/get_encodings

diff --git a/bin/cleanup-recode.sh b/bin/cleanup-recode.sh
index 3f8213d..4bf86d3 100755
--- a/bin/cleanup-recode.sh
+++ b/bin/cleanup-recode.sh
@@ -1,22 +1,15 @@
 #! /bin/sh
+set -e
 
 if ! echo "$LC_CTYPE" | grep -Fiq utf-8; then
-   echo "This script requires UTF-8 locale" 1>&2
-   exit 1
+    echo "This script requires UTF-8 locale" 1>&2
+    exit 1
 fi
 
-from_enc=utf-8
-to_enc=koi8-r
+. get_encodings
 
-while getopts f:t: opt; do
-   case $opt in
-      f ) from_enc="$OPTARG" ;;
-      t ) to_enc="$OPTARG" ;;
-   esac
-done
-shift `expr $OPTIND - 1`
-
-for dir in "${@:-.}"; do
-   cleanup-filenames-recursive.sh "$dir" &&
-   recode-filenames-recursive.py "$from_enc" "$to_enc" "$dir" || exit 1
+for dir in "${@:-.}"; do
+    cleanup-filenames-recursive.sh "$dir" &&
+    recode-filenames-recursive.py -f "$from_encoding" -t "$to_encoding" \
+        "$dir" || exit 1
 done
diff --git a/bin/cp_recode_fname b/bin/cp_recode_fname
index d6dd74b..13fe8c2 100755
--- a/bin/cp_recode_fname
+++ b/bin/cp_recode_fname
@@ -1,34 +1,9 @@
 #! /bin/sh
 set -e
 
-usage() {
-   echo "Usage: $0 [[from_enc] to_enc] filename" >&2
-   exit 1
-}
-
-if [ $# -eq 1 ]; then
-   from_enc="`python3 -c \"from m_lib.defenc import default_encoding; print(default_encoding)"`"
-   if [ "$from_enc" != utf-8 ]; then
-      to_enc=utf-8
-   else
-      usage
-   fi
-   filename="$1"
-elif [ $# -eq 2 ]; then
-   from_enc="`python3 -c \"from m_lib.defenc import default_encoding; print(default_encoding)"`"
-   to_enc="$1"
-   filename="$2"
-elif [ $# -eq 3 ]; then
-   from_enc="$1"
-   to_enc="$2"
-   filename="$3"
-else
-   usage
-fi
-
 cmd="`basename \"$0\"`"
 case "$cmd" in
-   cp_*) cmd="cp -p" ;;
+   cp_*) cmd="cp -ap" ;;
    mv_*) cmd=mv ;;
    *)
       echo "Unknown command $0, aborting" >&2
@@ -36,5 +11,16 @@ case "$cmd" in
       ;;
 esac
 
-filename_recoded=`echo "$filename" | iconv -f "$from_enc" -t "$to_enc"`
-exec $cmd "$filename" "$filename_recoded"
+. get_encodings
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 [-f from_encoding] [-t to_encoding] path [path ...]" >&2
+    exit 1
+fi
+
+for filename in "$@"; do
+    filename_recoded=`printf '%s\n' "$filename" |
+        iconv -f "$from_encoding" -t "$to_encoding"`
+    if [ "$filename_recoded" != "$filename" ]; then
+        $cmd "$filename" "$filename_recoded"
+    fi
+done
diff --git a/bin/get_encodings b/bin/get_encodings
new file mode 100755
index 0000000..7327926
--- /dev/null
+++ b/bin/get_encodings
@@ -0,0 +1,64 @@
+#! /bin/sh
+
+case "$0" in
+    */get_encodings)
+        sourced=false
+        ;;
+    *)
+        sourced=true
+        ;;
+esac
+
+from_encoding=
+to_encoding=
+default_encoding="`python3 -c \"from m_lib.defenc import default_encoding; print(default_encoding)"`"
+
+while getopts f:t: opt; do
+    case $opt in
+        f ) from_encoding="$OPTARG" ;;
+        t ) to_encoding="$OPTARG" ;;
+    esac
+done
+shift `expr $OPTIND - 1`
+filenames="$*"
+
+error() {
+    echo "$@" >&2
+    echo "Usage: $0 [-f from_encoding] [-t to_encoding] [path [path ...]]" >&2
+    exit 1
+}
+
+if [ -n "$from_encoding" ]; then
+    if [ -n "$to_encoding" ]; then
+        : # Everything is defined, no need to guess
+    elif [ "$from_encoding" = utf-8 ]; then
+        if [ "$default_encoding" = utf-8 ]; then
+            error Cannot guess to_encoding
+        else
+            to_encoding="$default_encoding"
+        fi
+    else
+        to_encoding=utf-8
+    fi
+elif [ -n "$to_encoding" ]; then
+    if [ "$to_encoding" = "$default_encoding" ]; then
+        if [ "$default_encoding" = utf-8 ]; then
+            error Cannot guess from_encoding
+        else
+            from_encoding=utf-8
+        fi
+    else
+        from_encoding="$default_encoding"
+    fi
+else
+    if [ "$default_encoding" = utf-8 ]; then
+        error Cannot guess encodings
+    else
+        from_encoding="$default_encoding"
+        to_encoding=utf-8
+    fi
+fi
+
+if [ "$sourced" = false ]; then
+    echo "$from_encoding $to_encoding"
+fi
diff --git a/bin/recode-filenames-recursive.py b/bin/recode-filenames-recursive.py
index 5408563..1d73a92 100755
--- a/bin/recode-filenames-recursive.py
+++ b/bin/recode-filenames-recursive.py
@@ -1,57 +1,39 @@
 #! /usr/bin/env python3
 
-import sys, os
+import os
 
-from m_lib.defenc import default_encoding
+from recode_filenames import parse_args, build_recode
 
-if len(sys.argv) == 1:
-    src_encoding = default_encoding
-    if src_encoding == 'utf-8':
-        sys.exit("Usage: %s [[src_enc] dst_enc [start_dir]]" % sys.argv[0])
-    else:
-        dst_encoding = 'utf-8'
-    start_dir = '.'
-elif len(sys.argv) == 2:
-    src_encoding = default_encoding
-    dst_encoding = sys.argv[1]
-    start_dir = '.'
-elif len(sys.argv) == 3:
-    src_encoding = default_encoding
-    dst_encoding = sys.argv[1]
-    start_dir = sys.argv[2]
-elif len(sys.argv) == 4:
-    src_encoding = sys.argv[1]
-    dst_encoding = sys.argv[2]
-    start_dir = sys.argv[3]
-else:
-    sys.exit("Usage: %s [[src_enc] dst_enc [start_dir]]" % sys.argv[0])
-
-# Fake for recode_filenames.py
-sys.argv = ['', src_encoding, dst_encoding]
-from recode_filenames import _recode
+from_encoding, to_encoding, dirnames = parse_args(default='.')
+_recode = build_recode(from_encoding, to_encoding)
 
 
 def _onerror(exc):
     raise exc
 
 
-plist = list(os.walk(start_dir, topdown=False, onerror=_onerror))
-
 save_dir = os.getcwd()
-for dirname, _subdirs, fnames in plist:
-    if dirname == '.':
-        continue
-    os.chdir(dirname)
-    for filename in fnames:
-        # if not exists - it was renamed already
-        if os.path.exists(filename) and \
-           os.path.isfile(filename):
-            newname = _recode(filename)
-            if newname != filename:
-                os.rename(filename, newname)
-    os.chdir('..')
-    dirname = os.path.basename(dirname)
-    newname = _recode(dirname)
-    if newname != dirname:
-        os.rename(dirname, newname)
-    os.chdir(save_dir)
+for start_dir in dirnames:
+    for dirname, _subdirs, fnames in list(
+        os.walk(start_dir, topdown=False, onerror=_onerror)
+    ):
+        if dirname != '.':
+            os.chdir(dirname)
+        for filename in fnames:
+            # if not exists - it was renamed already
+            if os.path.exists(filename) and \
+               os.path.isfile(filename):
+                newname = _recode(filename)
+                if not isinstance(newname, str):
+                    newname = newname.decode()
+                if newname != filename:
+                    os.rename(filename, newname)
+        if dirname != '.':
+            os.chdir('..')
+            dirname = os.path.basename(dirname)
+            newname = _recode(dirname)
+            if not isinstance(newname, str):
+                newname = newname.decode()
+            if newname != dirname:
+                os.rename(dirname, newname)
+            os.chdir(save_dir)
diff --git a/bin/recode_filenames.py b/bin/recode_filenames.py
index b8d3fa0..bd10bf5 100755
--- a/bin/recode_filenames.py
+++ b/bin/recode_filenames.py
@@ -1,69 +1,128 @@
 #! /usr/bin/env python3
 # -*- coding: koi8-r -*-
 
+import argparse
 import sys
 
-src_encoding = sys.argv[1]
-dst_encoding = sys.argv[2]
+from m_lib.defenc import default_encoding
 
-if src_encoding == "translit":
-    if dst_encoding == "koi8-r":
-        from m_lib.rus.lat2rus import lat2koi as _recode
-    elif dst_encoding == "cp1251":
-        from m_lib.rus.lat2rus import lat2win as _recode
+
+def parse_args(default=None):
+    parser = argparse.ArgumentParser(description='Recode filenames')
+    parser.add_argument('-f', '--from-encoding', help='from encoding')
+    parser.add_argument('-t', '--to-encoding', help='to encoding')
+    parser.add_argument('filename', nargs='*' if default else '+',
+                        default=[default], help='filenames to recode')
+    args = parser.parse_args()
+
+    from_encoding = args.from_encoding
+    to_encoding = args.to_encoding
+
+    if from_encoding:
+        if to_encoding:
+            pass  # Everything is defined, no need to guess
+        elif from_encoding == 'utf-8':
+            if default_encoding == 'utf-8':
+                sys.exit('Cannot guess to_encoding')
+            else:
+                to_encoding = default_encoding
+        else:
+            to_encoding = 'utf-8'
+    elif to_encoding:
+        if to_encoding == default_encoding:
+            if default_encoding == 'utf-8':
+                sys.exit('Cannot guess from_encoding')
+            else:
+                from_encoding = 'utf-8'
+        else:
+            from_encoding = default_encoding
     else:
-        raise NotImplementedError("destination encoding must be koi8-r or cp1251, not `%s'" % dst_encoding)
+        if default_encoding == 'utf-8':
+            sys.exit('Cannot guess encodings')
+        else:
+            from_encoding = default_encoding
+            to_encoding = 'utf-8'
+
+    return from_encoding, to_encoding, args.filename
+
+
+def build_recode(from_encoding, to_encoding):
+    if from_encoding == "translit":
+        if to_encoding == "koi8-r":
+            from m_lib.rus.lat2rus import lat2koi as _recode
+        elif to_encoding == "cp1251":
+            from m_lib.rus.lat2rus import lat2win as _recode
+        else:
+            raise NotImplementedError(
+                "destination encoding must be koi8-r or cp1251, "
+                "not `%s'" % to_encoding)
+
+    elif to_encoding == "translit":
+        if from_encoding == "koi8-r":
+            from m_lib.rus.rus2lat import koi2lat as _recode
+        elif from_encoding == "cp1251":
+            from m_lib.rus.rus2lat import win2lat as _recode
+        else:
+            raise NotImplementedError(
+                "source encoding must be koi8-r or cp1251, "
+                "not `%s'" % from_encoding)
+
+        from m_lib.rus.rus2lat import koi2lat_d
+        koi2lat_d["ÿ"] = ''  # remove apostrophs -
+        koi2lat_d["ø"] = ''  # they are not very good characters in filenames
+        koi2lat_d["ß"] = ''  # especially on Windoze
+        koi2lat_d["Ø"] = ''  # :-)
+
+    elif from_encoding == "url":
+        try:
+            from_encoding, to_encoding = to_encoding.split('/')
+        except ValueError:
+            from_encoding = to_encoding
+        from urllib.parse import unquote
+        from m_lib.opstring import recode
+
+        def _recode(s):
+            s = unquote(s)
+            if from_encoding != to_encoding:
+                s = recode(s, from_encoding, to_encoding, "replace")
+            return s
+
+    elif to_encoding == "url":
+        try:
+            from_encoding, to_encoding = from_encoding.split('/')
+        except ValueError:
+            to_encoding = from_encoding
+        from urllib.parse import quote
+        from m_lib.opstring import recode
+
+        def _recode(s):
+            if from_encoding != to_encoding:
+                s = recode(s, from_encoding, to_encoding, "replace")
+            # wget treats them as safe
+            # vvvvvvvvvvvvv
+            return quote(s, safe=";/?:@&=+$,()'")
 
-elif dst_encoding == "translit":
-    if src_encoding == "koi8-r":
-        from m_lib.rus.rus2lat import koi2lat as _recode
-    elif src_encoding == "cp1251":
-        from m_lib.rus.rus2lat import win2lat as _recode
     else:
-        raise NotImplementedError("source encoding must be koi8-r or cp1251, not `%s'" % src_encoding)
-
-    from m_lib.rus.rus2lat import koi2lat_d
-    koi2lat_d["ÿ"] = '' # remove apostrophs -
-    koi2lat_d["ø"] = '' # they are not very good characters in filenames
-    koi2lat_d["ß"] = '' # especially on Windoze
-    koi2lat_d["Ø"] = '' # :-)
-
-elif src_encoding == "url":
-    try:
-        src_encoding, dst_encoding = dst_encoding.split('/')
-    except ValueError:
-        src_encoding = dst_encoding
-    from m_lib.opstring import recode
-    import urllib
-    def _recode(s):
-        s = urllib.unquote(s)
-        if src_encoding != dst_encoding:
-            s = recode(s, src_encoding, dst_encoding, "replace")
-        return s
-
-elif dst_encoding == "url":
-    try:
-        src_encoding, dst_encoding = src_encoding.split('/')
-    except ValueError:
-        dst_encoding = src_encoding
-    from m_lib.opstring import recode
-    import urllib
-    def _recode(s):
-        if src_encoding != dst_encoding:
-            s = recode(s, src_encoding, dst_encoding, "replace")
-        return urllib.quote(s, safe=";/?:@&=+$,()'") # wget treats them as safe
-
-else:
-    from m_lib.opstring import recode
-    def _recode(s):
-        return recode(s, src_encoding, dst_encoding, "replace")
+        def _recode(s):
+            return s.encode(to_encoding, "surrogateescape").\
+                decode(from_encoding, "surrogateescape")
+
+    return _recode
 
 
 if __name__ == "__main__":
     import os
-    for filename in sys.argv[3:]:
+    from_encoding, to_encoding, filenames = parse_args()
+    _recode = build_recode(from_encoding, to_encoding)
+    for filename in filenames:
         new_name = _recode(filename)
-        if type(filename) is not type(new_name):
+        if not isinstance(new_name, str):
             new_name = new_name.decode()
         if new_name != filename:
+            sys.stdout.buffer.write(
+                b'%s %s / %s -> %s\n' % (
+                    from_encoding.encode(), to_encoding.encode(),
+                    filename.encode(default_encoding, 'replace'),
+                    new_name.encode(errors='replace'))
+            )
             os.rename(filename, new_name)
diff --git a/bin/unicode_norm_nfd.py b/bin/unicode_norm_nfd.py
index 2cca0d9..030de47 100755
--- a/bin/unicode_norm_nfd.py
+++ b/bin/unicode_norm_nfd.py
@@ -4,6 +4,8 @@
 import sys
 import unicodedata
 
+from m_lib.defenc import default_encoding
+
 
 def strip_accents(s):
     return ''.join(c for c in unicodedata.normalize('NFD', s)
@@ -20,4 +22,7 @@ if __name__ == '__main__':
     if len(sys.argv) == 1:
         sys.exit('Usage: %s name\n' % sys.argv[0])
     for name in sys.argv[1:]:
-        print(latin1_to_ascii(name))
+        sys.stdout.buffer.write(
+            latin1_to_ascii(name).encode(default_encoding, 'surrogateescape') +
+            b'\n'
+        )
-- 
2.39.5