From: Oleg Broytman Date: Tue, 19 Nov 2013 19:19:55 +0000 (+0400) Subject: Add lxml.etree-based implementation X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=d4a65630f87d5732b003a33b75bc1bed430f43b6;p=extfs.d.git Add lxml.etree-based implementation --- diff --git a/xml b/xml deleted file mode 120000 index e9fc63c..0000000 --- a/xml +++ /dev/null @@ -1 +0,0 @@ -xml-unified \ No newline at end of file diff --git a/xml b/xml new file mode 100755 index 0000000..084a784 --- /dev/null +++ b/xml @@ -0,0 +1,436 @@ +#! /usr/bin/env python +"""XML Virtual FileSystem for Midnight Commander + +The script requires Midnight Commander 3.1+ +(http://www.midnight-commander.org/), Python 2.4+ (http://www.python.org/). + +For mc 4.7+ put the script in $HOME/[.local/share/].mc/extfs.d. +For older versions put it in /usr/[local/][lib|share]/mc/extfs +and add a line "xml" to the /usr/[local/][lib|share]/mc/extfs/extfs.ini. +Make the script executable. + +For mc 4.7+ run this "cd" command in the Midnight Commander (in the "bindings" +file the command is "%cd"): cd file/xml://; In older versions it is +cd file#xml, where "file" is the name of your XML file. + +The VFS represents tags as directories; the directories are numbered to +distinguish tags with the same name; also numbering helps to sort tags by their +order in XML instead of sorting them by name. Attributes, text nodes and +comments are represented as text files; attributes are shown in a file named +"attributes", attributes are listed in the file as name=value lines (I +deliberately ignore a small chance of newline characters in values); names and +values are reencoded to the console encoding. Text nodes and comments are +collected in a file named "text", stripped and reencoded. The filesystem is +read-only. ElementTree- and lxml.etree-based implementations don't show +namespaces as attributes. + +It is useful to have a top-down view on an XML structure but it's especially +convenient to extract text values from tags. One can get, for example, a +base64-encoded image - just walk down the VFS to the tag's directory and copy +its text file to a real file. + +The VFS was inspired by a FUSE xmlfs: https://github.com/halhen/xmlfs + +""" + +__version__ = "0.5.0" +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 2013 PhiloSoft Design" +__license__ = "GPL" + +default_implementation = None # Can be None for default choice, + # 'lxml', 'elementtree' or 'minidom' + +use_minidom = True +use_elementtree = False +use_lxml = False + +import math +import sys +import xml.dom.minidom + +try: + import xml.etree.ElementTree as ET +except ImportError: + pass +else: + use_elementtree = True + +try: + import lxml.etree as etree +except ImportError: + pass +else: + use_lxml = True + +try: + import locale + use_locale = True +except ImportError: + use_locale = False + +if use_locale: + # Get the default charset. + try: + lcAll = locale.getdefaultlocale() + except locale.Error, err: + print >>sys.stderr, "WARNING:", err + lcAll = [] + + if len(lcAll) == 2: + default_encoding = lcAll[1] + else: + try: + default_encoding = locale.getpreferredencoding() + except locale.Error, err: + print >>sys.stderr, "WARNING:", err + default_encoding = sys.getdefaultencoding() +else: + default_encoding = sys.getdefaultencoding() + +import logging +logger = logging.getLogger('xml-mcextfs') +log_err_handler = logging.StreamHandler(sys.stderr) +logger.addHandler(log_err_handler) +logger.setLevel(logging.INFO) + +if len(sys.argv) < 3: + logger.critical("""\ +XML Virtual FileSystem for Midnight Commander version %s +Author: %s +%s + +This is not a program. Put the script in $HOME/[.local/share/].mc/extfs.d or +/usr/[local/][lib|share]/mc/extfs. For more information read the source!""", + __version__, __author__, __copyright__ +) + sys.exit(1) + + +locale.setlocale(locale.LC_ALL, '') + + +class XmlVfs(object): + def __init__(self): + self.parse() + + def list(self): + self._list(self.getroot()) + + def get_child_node(self, node, i): + n = 0 + for element in self.getchildren(node): + if self.istag(element): + n += 1 + if n == i: + return element + xml_error('There are less than %d nodes' % i) + + +class MiniDOMXmlVfs(XmlVfs): + def parse(self): + self.document = xml.dom.minidom.parse(sys.argv[2]) + + def hasattrs(self, node): + return bool(node.attributes) + + def attrs2text(self, node): + attrs = node.attributes + attrs = [attrs.item(i) for i in range (attrs.length)] + return '\n'.join(["%s=%s" % + (a.name.encode(default_encoding, "replace"), + a.value.encode(default_encoding, "replace")) + for a in attrs]) + + def collect_text(self, node): + text_accumulator = [] + for element in node.childNodes: + if element.localName: + continue + elif element.nodeType == element.COMMENT_NODE: + text = u"" % element.nodeValue + elif element.nodeType == element.TEXT_NODE: + text = element.nodeValue.strip() + else: + xml_error("Unknown node type %d" % element.nodeType) + if text: text_accumulator.append(text) + return '\n'.join(text_accumulator).encode(default_encoding, "replace") + + def _list(self, node, path=''): + childNodes = node.childNodes + n = 0 + for element in childNodes: + if element.localName: + n += 1 + if n: + width = int(math.log10(n))+1 + template = "%%0%dd" % width + else: + template = "%d" + n = 0 + for element in childNodes: + if element.localName: + n += 1 + if path: + subpath = '%s/%s %s' % (path, template % n, element.localName) + else: + subpath = '%s %s' % (template % n, element.localName) + subpath_encoded = subpath.encode(default_encoding, "replace") + print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded + if self.hasattrs(element): + attr_text = self.attrs2text(element) + print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % ( + len(attr_text), subpath_encoded) + text = self.collect_text(element) + if text: + print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % ( + len(text), subpath_encoded) + self._list(element, subpath) + + def getroot(self): + return self.document + + def getchildren(self, node): + return node.childNodes + + def istag(self, node): + return bool(node.localName) + + +if use_elementtree or use_lxml: + class CommonEtreeXmlVfs(XmlVfs): + def hasattrs(self, node): + return bool(node.attrib) + + def collect_text(self, node): + text_accumulator = [] + if node.text: + text = node.text.strip() + if text: text_accumulator.append(text) + for element in node: + if not self.istag(element): + text = u"" % text + text_accumulator.append(text) + if node.tail: + text = node.tail.strip() + if text: text_accumulator.append(text) + return '\n'.join(text_accumulator).encode(default_encoding, "replace") + + def getroot(self): + return self.document.getroot() + + def getchildren(self, node): + return list(node) + + def istag(self, node): + return isinstance(node.tag, basestring) + + +if use_elementtree: + class ElementTreeXmlVfs(CommonEtreeXmlVfs): + def parse(self): + # Copied from http://effbot.org/zone/element-pi.ht + + class PIParser(ET.XMLTreeBuilder): + + def __init__(self): + ET.XMLTreeBuilder.__init__(self) + # assumes ElementTree 1.2.X + self._parser.CommentHandler = self.handle_comment + self._parser.ProcessingInstructionHandler = self.handle_pi + self._target.start("document", {}) + + def close(self): + self._target.end("document") + return ET.XMLTreeBuilder.close(self) + + def handle_comment(self, data): + self._target.start(ET.Comment, {}) + self._target.data(data) + self._target.end(ET.Comment) + + def handle_pi(self, target, data): + self._target.start(ET.PI, {}) + self._target.data(target + " " + data) + self._target.end(ET.PI) + + self.document = ET.parse(sys.argv[2], PIParser()) + + def attrs2text(self, node): + attr_accumulator = [] + for name, value in node.attrib.items(): + name = name.encode(default_encoding, "replace") + value = value.encode(default_encoding, "replace") + if name.startswith('{'): + name = name.split('}', 1)[1] # Remove XML namespace + attr_accumulator.append("%s=%s" % (name, value)) + return '\n'.join(attr_accumulator) + + def _list(self, node, path=''): + n = len(node) + if n: + width = int(math.log10(n))+1 + template = "%%0%dd" % width + else: + template = "%d" + n = 0 + for element in node: + if not isinstance(element.tag, basestring): + continue + n += 1 + tag = element.tag + if tag.startswith('{'): + tag = tag.split('}', 1)[1] # Remove XML namespace + if path: + subpath = '%s/%s %s' % (path, template % n, tag) + else: + subpath = '%s %s' % (template % n, tag) + subpath_encoded = subpath.encode(default_encoding, "replace") + print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded + if self.hasattrs(element): + attr_text = self.attrs2text(element) + print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % ( + len(attr_text), subpath_encoded) + text = self.collect_text(element) + if text: + print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % ( + len(text), subpath_encoded) + self._list(element, subpath) + + +if use_lxml: + class LxmlEtreeXmlVfs(CommonEtreeXmlVfs): + def parse(self): + self.document = etree.parse(sys.argv[2]) + + def attrs2text(self, node): + attr_accumulator = [] + for name, value in node.attrib.items(): + name = etree.QName(name).localname.encode(default_encoding, "replace") + value = value.encode(default_encoding, "replace") + attr_accumulator.append("%s=%s" % (name, value)) + return '\n'.join(attr_accumulator) + + def list(self): + self._list(self.getroot()) + + def _list(self, node, path=''): + n = len(node) + if n: + width = int(math.log10(n))+1 + template = "%%0%dd" % width + else: + template = "%d" + n = 0 + for element in node: + if not isinstance(element.tag, basestring): + continue + n += 1 + tag = etree.QName(element.tag).localname + if path: + subpath = '%s/%s %s' % (path, template % n, tag) + else: + subpath = '%s %s' % (template % n, tag) + subpath_encoded = subpath.encode(default_encoding, "replace") + print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded + if self.hasattrs(element): + attr_text = self.attrs2text(element) + print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % ( + len(attr_text), subpath_encoded) + text = self.collect_text(element) + if text: + print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % ( + len(text), subpath_encoded) + self._list(element, subpath) + + def getroot(self): + return [self.document.getroot()] + + +def build_xmlvfs(): + if default_implementation is None: + if use_lxml: + return LxmlEtreeXmlVfs() + elif use_elementtree: + return ElementTreeXmlVfs() + else: + return MiniDOMXmlVfs() + elif default_implementation == 'minidom': + return MiniDOMXmlVfs() + elif default_implementation == 'elementtree': + return ElementTreeXmlVfs() + elif default_implementation == 'lxml': + return LxmlEtreeXmlVfs() + + +def mcxml_list(): + """List the entire VFS""" + + xmlvfs = build_xmlvfs() + xmlvfs.list() + + +def mcxml_copyout(): + """Extract a file from the VFS""" + + xmlvfs = build_xmlvfs() + xml_filename = sys.argv[3] + real_filename = sys.argv[4] + + node = xmlvfs.getroot() + for path_comp in xml_filename.split('/'): + if ' ' in path_comp: + i = int(path_comp.split(' ', 1)[0]) + node = xmlvfs.get_child_node(node, i) + elif path_comp in ('attributes', 'text'): + break + else: + xml_error('Unknown file') + + if path_comp == 'attributes': + if xmlvfs.hasattrs(node): + text = xmlvfs.attrs2text(node) + else: + xml_error('There are no attributes') + + if path_comp == 'text': + text = xmlvfs.collect_text(node) + + outfile = open(real_filename, 'w') + outfile.write(text) + outfile.close() + + +def mcxml_copyin(): + """Put a file to the VFS""" + sys.exit("XML VFS doesn't support adding files (read-only filesystem)") + +def mcxml_rm(): + """Remove a file from the VFS""" + sys.exit("XML VFS doesn't support removing files/directories (read-only filesystem)") + +mcxml_rmdir = mcxml_rm + +def mcxml_mkdir(): + """Create a directory in the VFS""" + sys.exit("XML VFS doesn't support creating directories (read-only filesystem)") + + +def xml_error(error_str): + logger.critical("Error walking XML file: %s", error_str) + sys.exit(1) + +command = sys.argv[1] +procname = "mcxml_" + command + +g = globals() +if not g.has_key(procname): + logger.critical("Unknown command %s", command) + sys.exit(1) + +try: + g[procname]() +except SystemExit: + raise +except: + logger.exception("Error during run") diff --git a/xml-ANNOUNCE b/xml-ANNOUNCE index bdd1bd7..e709e53 100644 --- a/xml-ANNOUNCE +++ b/xml-ANNOUNCE @@ -5,11 +5,11 @@ WHAT IS IT View an XML file in Midnight Commander as a filesystem. -WHAT'S NEW in version 0.3.2 (2013-11-19) - Refactored collection of text and comments nodes. +WHAT'S NEW in version 0.5.0 (2013-11-19) + Added lxml.etree-based implementation. -WHAT'S NEW in version 0.3.1 (2013-11-18) - Fixed a bug in directories permissions. +WHAT'S NEW in version 0.4.0 (2013-11-19) + Added ElementTree-based implementation. WHAT'S NEW in version 0.3.0 (2013-11-16) Initial release. diff --git a/xml-unified b/xml-unified deleted file mode 100755 index c8758a0..0000000 --- a/xml-unified +++ /dev/null @@ -1,370 +0,0 @@ -#! /usr/bin/env python -"""XML Virtual FileSystem for Midnight Commander - -The script requires Midnight Commander 3.1+ -(http://www.midnight-commander.org/), Python 2.4+ (http://www.python.org/). - -For mc 4.7+ put the script in $HOME/[.local/share/].mc/extfs.d. -For older versions put it in /usr/[local/][lib|share]/mc/extfs -and add a line "xml" to the /usr/[local/][lib|share]/mc/extfs/extfs.ini. -Make the script executable. - -For mc 4.7+ run this "cd" command in the Midnight Commander (in the "bindings" -file the command is "%cd"): cd file/xml://; In older versions it is -cd file#xml, where "file" is the name of your XML file. - -The VFS represents tags as directories; the directories are numbered to -distinguish tags with the same name; also numbering helps to sort tags by their -order in XML instead of sorting them by name. Attributes, text nodes and -comments are represented as text files; attributes are shown in a file named -"attributes", attributes are listed in the file as name=value lines (I -deliberately ignore a small chance of newline characters in values); names and -values are reencoded to the console encoding. Text nodes and comments are -collected in a file named "text", stripped and reencoded. The filesystem is -read-only. - -It is useful to have a top-down view on an XML structure but it's especially -convenient to extract text values from tags. One can get, for example, a -base64-encoded image - just walk down the VFS to the tag's directory and copy -its text file to a real file. - -The VFS was inspired by a FUSE xmlfs: https://github.com/halhen/xmlfs - -""" - -__version__ = "0.5.0" -__author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2013 PhiloSoft Design" -__license__ = "GPL" - -default_implementation = None # Can be elementtree or minidom - -use_minidom = True -use_elementtree = False - -import math -import sys -import xml.dom.minidom - -try: - import xml.etree.ElementTree as ET -except ImportError: - pass -else: - use_elementtree = True - -try: - import locale - use_locale = True -except ImportError: - use_locale = False - -if use_locale: - # Get the default charset. - try: - lcAll = locale.getdefaultlocale() - except locale.Error, err: - print >>sys.stderr, "WARNING:", err - lcAll = [] - - if len(lcAll) == 2: - default_encoding = lcAll[1] - else: - try: - default_encoding = locale.getpreferredencoding() - except locale.Error, err: - print >>sys.stderr, "WARNING:", err - default_encoding = sys.getdefaultencoding() -else: - default_encoding = sys.getdefaultencoding() - -import logging -logger = logging.getLogger('xml-mcextfs') -log_err_handler = logging.StreamHandler(sys.stderr) -logger.addHandler(log_err_handler) -logger.setLevel(logging.INFO) - -if len(sys.argv) < 3: - logger.critical("""\ -XML Virtual FileSystem for Midnight Commander version %s -Author: %s -%s - -This is not a program. Put the script in $HOME/[.local/share/].mc/extfs.d or -/usr/[local/][lib|share]/mc/extfs. For more information read the source!""", - __version__, __author__, __copyright__ -) - sys.exit(1) - - -locale.setlocale(locale.LC_ALL, '') - - -class XmlVfs(object): - def __init__(self): - self.parse() - - def list(self): - self._list(self.getroot()) - - def get_child_node(self, node, i): - n = 0 - for element in self.getchildren(node): - if self.istag(element): - n += 1 - if n == i: - return element - xml_error('There are less than %d nodes' % i) - - -class MiniDOMXmlVfs(XmlVfs): - def parse(self): - self.document = xml.dom.minidom.parse(sys.argv[2]) - - def hasattrs(self, node): - return bool(node.attributes) - - def attrs2text(self, node): - attrs = node.attributes - attrs = [attrs.item(i) for i in range (attrs.length)] - return '\n'.join(["%s=%s" % - (a.name.encode(default_encoding, "replace"), - a.value.encode(default_encoding, "replace")) - for a in attrs]) - - def collect_text(self, node): - text_accumulator = [] - for element in node.childNodes: - if element.localName: - continue - elif element.nodeType == element.COMMENT_NODE: - text = u"" % element.nodeValue - elif element.nodeType == element.TEXT_NODE: - text = element.nodeValue.strip() - else: - xml_error("Unknown node type %d" % element.nodeType) - if text: text_accumulator.append(text) - return '\n'.join(text_accumulator).encode(default_encoding, "replace") - - def _list(self, node, path=''): - childNodes = node.childNodes - n = 0 - for element in childNodes: - if element.localName: - n += 1 - if n: - width = int(math.log10(n))+1 - template = "%%0%dd" % width - else: - template = "%d" - n = 0 - for element in childNodes: - if element.localName: - n += 1 - if path: - subpath = '%s/%s %s' % (path, template % n, element.localName) - else: - subpath = '%s %s' % (template % n, element.localName) - subpath_encoded = subpath.encode(default_encoding, "replace") - print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded - if self.hasattrs(element): - attr_text = self.attrs2text(element) - print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % ( - len(attr_text), subpath_encoded) - text = self.collect_text(element) - if text: - print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % ( - len(text), subpath_encoded) - self._list(element, subpath) - - def getroot(self): - return self.document - - def getchildren(self, node): - return node.childNodes - - def istag(self, node): - return bool(node.localName) - - -if use_elementtree: - class ElementTreeXmlVfs(XmlVfs): - def parse(self): - # Copied from http://effbot.org/zone/element-pi.ht - - class PIParser(ET.XMLTreeBuilder): - - def __init__(self): - ET.XMLTreeBuilder.__init__(self) - # assumes ElementTree 1.2.X - self._parser.CommentHandler = self.handle_comment - self._parser.ProcessingInstructionHandler = self.handle_pi - self._target.start("document", {}) - - def close(self): - self._target.end("document") - return ET.XMLTreeBuilder.close(self) - - def handle_comment(self, data): - self._target.start(ET.Comment, {}) - self._target.data(data) - self._target.end(ET.Comment) - - def handle_pi(self, target, data): - self._target.start(ET.PI, {}) - self._target.data(target + " " + data) - self._target.end(ET.PI) - - self.document = ET.parse(sys.argv[2], PIParser()) - - def hasattrs(self, node): - return bool(node.attrib) - - def attrs2text(self, node): - attr_accumulator = [] - for name, value in node.attrib.items(): - name = name.encode(default_encoding, "replace") - value = value.encode(default_encoding, "replace") - if name.startswith('{'): - name = name.split('}', 1)[1] # Remove XML namespace - attr_accumulator.append("%s=%s" % (name, value)) - return '\n'.join(attr_accumulator) - - def collect_text(self, node): - text_accumulator = [] - if node.text: - text = node.text.strip() - if text: text_accumulator.append(text) - for element in node: - if element.tag is ET.Comment: - text = u"" % text - text_accumulator.append(text) - if node.tail: - text = node.tail.strip() - if text: text_accumulator.append(text) - return '\n'.join(text_accumulator).encode(default_encoding, "replace") - - def _list(self, node, path=''): - n = len(node) - if n: - width = int(math.log10(n))+1 - template = "%%0%dd" % width - else: - template = "%d" - n = 0 - for element in node: - if not isinstance(element.tag, basestring): - continue - n += 1 - tag = element.tag - if tag.startswith('{'): - tag = tag.split('}', 1)[1] # Remove XML namespace - if path: - subpath = '%s/%s %s' % (path, template % n, tag) - else: - subpath = '%s %s' % (template % n, tag) - subpath_encoded = subpath.encode(default_encoding, "replace") - print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded - if self.hasattrs(element): - attr_text = self.attrs2text(element) - print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % ( - len(attr_text), subpath_encoded) - text = self.collect_text(element) - if text: - print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % ( - len(text), subpath_encoded) - self._list(element, subpath) - - def getroot(self): - return self.document.getroot() - - def getchildren(self, node): - return list(node) - - def istag(self, node): - return isinstance(node.tag, basestring) - - -def build_xmlvfs(): - if default_implementation is None: - if use_elementtree: - return ElementTreeXmlVfs() - else: - return MiniDOMXmlVfs() - elif default_implementation == 'minidom': - return MiniDOMXmlVfs() - elif default_implementation == 'elementtree': - return ElementTreeXmlVfs() - - -def mcxml_list(): - """List the entire VFS""" - - xmlvfs = build_xmlvfs() - xmlvfs.list() - - -def mcxml_copyout(): - """Extract a file from the VFS""" - - xmlvfs = build_xmlvfs() - xml_filename = sys.argv[3] - real_filename = sys.argv[4] - - node = xmlvfs.getroot() - for path_comp in xml_filename.split('/'): - if ' ' in path_comp: - i = int(path_comp.split(' ', 1)[0]) - node = xmlvfs.get_child_node(node, i) - elif path_comp in ('attributes', 'text'): - break - else: - xml_error('Unknown file') - - if path_comp == 'attributes': - if xmlvfs.hasattrs(node): - text = xmlvfs.attrs2text(node) - else: - xml_error('There are no attributes') - - if path_comp == 'text': - text = xmlvfs.collect_text(node) - - outfile = open(real_filename, 'w') - outfile.write(text) - outfile.close() - - -def mcxml_copyin(): - """Put a file to the VFS""" - sys.exit("XML VFS doesn't support adding files (read-only filesystem)") - -def mcxml_rm(): - """Remove a file from the VFS""" - sys.exit("XML VFS doesn't support removing files/directories (read-only filesystem)") - -mcxml_rmdir = mcxml_rm - -def mcxml_mkdir(): - """Create a directory in the VFS""" - sys.exit("XML VFS doesn't support creating directories (read-only filesystem)") - - -def xml_error(error_str): - logger.critical("Error walking XML file: %s", error_str) - sys.exit(1) - -command = sys.argv[1] -procname = "mcxml_" + command - -g = globals() -if not g.has_key(procname): - logger.critical("Unknown command %s", command) - sys.exit(1) - -try: - g[procname]() -except SystemExit: - raise -except: - logger.exception("Error during run")