Use encoding (default is utf-8) and unicode

author Oleg Broytman <phd@phdru.name>
Fri, 2 Sep 2016 22:26:02 +0000 (01:26 +0300)
committer Oleg Broytman <phd@phdru.name>
Fri, 2 Sep 2016 22:26:02 +0000 (01:26 +0300)
diff --git a/demo/sample.sql b/demo/sample.sql

index fb31ec364763533aef6d857ce53b63c6d229fd87..64818c3818e1208d6ccde243a3aadd1e8e7dfd00 100644 (file)
--- a/demo/sample.sql
+++ b/demo/sample.sql
@@ -1,5 +1,5 @@
  SELECT * FROM `mytable`; -- line-comment"
  SELECT * FROM `mytable`; -- line-comment"
-INSERT into /* inline comment */ mytable VALUES (1, 'one');
+INSERT into /* inline comment */ mytable VALUES (1, 'тест');
  /*! directive*/ INSERT INTO `MyTable` (`Id`, `Name`)
  VALUES (1, 'one');
  
  /*! directive*/ INSERT INTO `MyTable` (`Id`, `Name`)
  VALUES (1, 'one');
  
diff --git a/docs/index.rst b/docs/index.rst

index 356f1d124872edffef16db48578a1a497ff03764..49cb311151e8825fa19315cc6b91e54dcf2e4a1d 100644 (file)
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -25,10 +25,16 @@ mysql-to-sql.py
  
  Usage::
  
  
  Usage::
  
-    mysql-to-sql.py [infile] [[-o] outfile]
+    mysql-to-sql.py [-e encoding] [-E output_encoding] [infile] [[-o] outfile]
  
  Options::
  
  
  Options::
  
+   -e ENCODING, --encoding ENCODING
+                           input/output encoding, default is utf-8
+   -E OUTPUT_ENCODING, --output-encoding OUTPUT_ENCODING
+                           separate output encoding, default is the same as
+                           `-e` except for console; for console output charset
+                           from the current locale is used
      infile                 Input file, stdin if absent or '-'
      -o, --outfile outfile  Output file, stdout if absent or '-'
  
      infile                 Input file, stdin if absent or '-'
      -o, --outfile outfile  Output file, stdout if absent or '-'
  
diff --git a/mysql2sql/print_tokens.py b/mysql2sql/print_tokens.py

index 142391fbe05196497b25396a571072765d099ea5..3e2b0d5cab08be97bba84a777fe52ceddb422fca 100644 (file)
--- a/mysql2sql/print_tokens.py
+++ b/mysql2sql/print_tokens.py
@@ -2,10 +2,15 @@
  import sys
  
  
  import sys
  
  
-def print_tokens(token_list, outfile=sys.stdout):
+def print_tokens(token_list, outfile=sys.stdout, encoding=None):
+    if encoding:
+        outfile = getattr(outfile, 'buffer', outfile)
      for token in token_list.flatten():
      for token in token_list.flatten():
-        outfile.write(token.normalized)
+        normalized = token.normalized
+        if encoding:
+            normalized = normalized.encode(encoding)
+        outfile.write(normalized)
  
  
  def tlist2str(token_list):
  
  
  def tlist2str(token_list):
-    return ''.join(token.normalized for token in token_list.flatten())
+    return u''.join(token.normalized for token in token_list.flatten())
diff --git a/mysql2sql/process_tokens.py b/mysql2sql/process_tokens.py

index 9e1e760a3e4e65461511457f1994275b0339c4e7..1e74ac9b69a5a416bb6cf8b0418219095b73b357 100644 (file)
--- a/mysql2sql/process_tokens.py
+++ b/mysql2sql/process_tokens.py
@@ -33,16 +33,17 @@ if PY3:
  class StatementGrouper(object):
      """Collect lines and reparse until the last statement is complete"""
  
  class StatementGrouper(object):
      """Collect lines and reparse until the last statement is complete"""
  
-    def __init__(self):
+    def __init__(self, encoding=None):
          self.lines = []
          self.statements = []
          self.lines = []
          self.statements = []
+        self.encoding = encoding
  
      def process_line(self, line):
          self.lines.append(line)
          self.process_lines()
  
      def process_lines(self):
  
      def process_line(self, line):
          self.lines.append(line)
          self.process_lines()
  
      def process_lines(self):
-        statements = parse(''.join(self.lines))
+        statements = parse(''.join(self.lines), encoding=self.encoding)
          last_stmt = statements[-1]
          for i in xrange(len(last_stmt.tokens) - 1, 0, -1):
              token = last_stmt.tokens[i]
          last_stmt = statements[-1]
          for i in xrange(len(last_stmt.tokens) - 1, 0, -1):
              token = last_stmt.tokens[i]
@@ -64,7 +65,7 @@ class StatementGrouper(object):
      def close(self):
          if not self.lines:
              return
      def close(self):
          if not self.lines:
              return
-        tokens = parse(''.join(self.lines))
+        tokens = parse(''.join(self.lines), encoding=self.encoding)
          for token in tokens:
              if (token.ttype not in (Comment.Single, Comment.Multiline,
                                      Newline, Whitespace)):
          for token in tokens:
              if (token.ttype not in (Comment.Single, Comment.Multiline,
                                      Newline, Whitespace)):
diff --git a/requirements.txt b/requirements.txt

index 90c942b69c75765595903630b00831ac51e9ab0f..f1bf8b4457acfd2f1c4fe5d4b347718fa9623ec8 100644 (file)
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@
  
  argparse; python_version == '2.6'
  sqlparse
  
  argparse; python_version == '2.6'
  sqlparse
+m_lib>=2.0; python_version >= '2.6' and python_version < '3.0'
+m_lib>=3.0; python_version >= '3.4'
diff --git a/scripts/mysql-to-sql.py b/scripts/mysql-to-sql.py

index c40563fed0e7d849554eac67358026789bbc090f..23197ffce6b2cbe7b312d7ab7b0f60d02ba7a7c2 100755 (executable)
--- a/scripts/mysql-to-sql.py
+++ b/scripts/mysql-to-sql.py
@@ -2,28 +2,38 @@
  from __future__ import print_function
  
  import argparse
  from __future__ import print_function
  
  import argparse
+from io import open
  import sys
  
  from mysql2sql.print_tokens import print_tokens
  from mysql2sql.process_tokens import requote_names, StatementGrouper
  
  import sys
  
  from mysql2sql.print_tokens import print_tokens
  from mysql2sql.process_tokens import requote_names, StatementGrouper
  
+from m_lib.defenc import default_encoding
  
  
-def main(infile, outfile):
-    grouper = StatementGrouper()
+
+def main(infile, encoding, outfile, output_encoding):
+    grouper = StatementGrouper(encoding=encoding)
      for line in infile:
          grouper.process_line(line)
          if grouper.statements:
              for statement in grouper.get_statements():
                  requote_names(statement)
      for line in infile:
          grouper.process_line(line)
          if grouper.statements:
              for statement in grouper.get_statements():
                  requote_names(statement)
-                print_tokens(statement, outfile=outfile)
+                print_tokens(statement, outfile=outfile,
+                             encoding=output_encoding)
      tokens = grouper.close()
      if tokens:
          for token in tokens:
      tokens = grouper.close()
      if tokens:
          for token in tokens:
-            print_tokens(token, outfile=outfile)
+            print_tokens(token, outfile=outfile, encoding=output_encoding)
  
  
  if __name__ == '__main__':
      parser = argparse.ArgumentParser(description='Convert MySQL to SQL')
  
  
  if __name__ == '__main__':
      parser = argparse.ArgumentParser(description='Convert MySQL to SQL')
+    parser.add_argument('-e', '--encoding', default='utf-8',
+                        help='input/output encoding, default is utf-8')
+    parser.add_argument('-E', '--output-encoding',
+                        help='separate output encoding, default is the same '
+                        'as -e except for console; for console output '
+                        'charset from the current locale is used')
      parser.add_argument('-o', '--outfile', help='output file name')
      parser.add_argument('infile', help='input file name')
      parser.add_argument('output_file', nargs='?', help='output file name')
      parser.add_argument('-o', '--outfile', help='output file name')
      parser.add_argument('infile', help='input file name')
      parser.add_argument('output_file', nargs='?', help='output file name')
@@ -33,7 +43,7 @@ if __name__ == '__main__':
          if args.infile == '-':
              infile = sys.stdin
          else:
          if args.infile == '-':
              infile = sys.stdin
          else:
-            infile = open(args.infile, 'rt')
+            infile = open(args.infile, 'rt', encoding=args.encoding)
      else:
          infile = sys.stdin
  
      else:
          infile = sys.stdin
  
@@ -56,14 +66,21 @@ if __name__ == '__main__':
      else:
          outfile = '-'
  
      else:
          outfile = '-'
  
+    if args.output_encoding:
+        output_encoding = args.output_encoding
+    elif outfile == '-':
+        output_encoding = default_encoding
+    else:
+        output_encoding = args.encoding
+
      if outfile == '-':
          outfile = sys.stdout
      else:
          try:
      if outfile == '-':
          outfile = sys.stdout
      else:
          try:
-            outfile = open(outfile, 'wt')
+            outfile = open(outfile, 'wt', encoding=output_encoding)
          except:
              if infile is not sys.stdin:
                  infile.close()
              raise
  
          except:
              if infile is not sys.stdin:
                  infile.close()
              raise
  
-    main(infile, outfile)
+    main(infile, args.encoding, outfile, output_encoding)
diff --git a/tests/test_tokens.py b/tests/test_tokens.py

index c39cd311b09c5e4b7b588101a94e7fadd3d49c3f..83951c5cc3152d89ed965351804ff5076bd14924 100755 (executable)
--- a/tests/test_tokens.py
+++ b/tests/test_tokens.py
@@ -1,5 +1,5 @@
  #! /usr/bin/env python
  #! /usr/bin/env python
-
+# -*- coding: utf-8 -*-
  
  import unittest
  from sqlparse import parse
  
  import unittest
  from sqlparse import parse
@@ -16,6 +16,16 @@ class TestTokens(unittest.TestCase):
          query = tlist2str(parsed)
          self.assertEqual(query, 'SELECT * FROM "T"')
  
          query = tlist2str(parsed)
          self.assertEqual(query, 'SELECT * FROM "T"')
  
+    def test_encoding(self):
+        parsed = parse("insert into test (1, 'тест')", 'utf-8')[0]
+        query = tlist2str(parsed).encode('utf-8')
+        self.assertEqual(query, "INSERT INTO test (1, 'тест')")
+
+    def test_unicode(self):
+        parsed = parse(u"insert into test (1, 'тест')")[0]
+        query = tlist2str(parsed)
+        self.assertEqual(query, u"INSERT INTO test (1, 'тест')")
+
  
  if __name__ == "__main__":
      main()
  
  if __name__ == "__main__":
      main()
author	Oleg Broytman <phd@phdru.name>
	Fri, 2 Sep 2016 22:26:02 +0000 (01:26 +0300)
committer	Oleg Broytman <phd@phdru.name>
	Fri, 2 Sep 2016 22:26:02 +0000 (01:26 +0300)
demo/sample.sql		patch | blob | history
docs/index.rst		patch | blob | history
mysql2sql/print_tokens.py		patch | blob | history
mysql2sql/process_tokens.py		patch | blob | history
requirements.txt		patch | blob | history
scripts/mysql-to-sql.py		patch | blob | history
tests/test_tokens.py		patch | blob | history