lxml.etree-based implementation shows namespaces

[extfs.d.git] / xml
diff --git a/xml b/xml

index 1c98c86fbc6a97ed0bd7a1d5ecd074736757d85f..8b4b299956b57df41c57a1d7a45927420e5835ae 100755 (executable)
--- a/xml
+++ b/xml
@@ -4,36 +4,66 @@
  The script requires Midnight Commander 3.1+
  (http://www.midnight-commander.org/), Python 2.4+ (http://www.python.org/).
  
-For mc 4.7+ put the script in $HOME/.mc/extfs.d.
+For mc 4.7+ put the script in $HOME/[.local/share/].mc/extfs.d.
  For older versions put it in /usr/[local/][lib|share]/mc/extfs
  and add a line "xml" to the /usr/[local/][lib|share]/mc/extfs/extfs.ini.
  Make the script executable.
  
-Run this "cd" command in the Midnight Commander (in the "bindings" file the
-command is "%cd"): cd file.xml#xml, where "file.xml" is the name of your xml
-file.
+For mc 4.7+ run this "cd" command in the Midnight Commander (in the "bindings"
+file the command is "%cd"): cd file/xml://; in older versions it is
+cd file#xml, where "file" is the name of your XML file.
  
  The VFS represents tags as directories; the directories are numbered to
  distinguish tags with the same name; also numbering helps to sort tags by their
  order in XML instead of sorting them by name. Attributes, text nodes and
  comments are represented as text files; attributes are shown in a file named
  "attributes", attributes are listed in the file as name=value lines (I
-deliberately ignore a small chance there is a newline character in values). The
-filesystem is read-only.
+deliberately ignore a small chance of newline characters in values); names and
+values are reencoded to the console encoding. Text nodes and comments are
+collected in a file named "text", stripped and reencoded. The filesystem is
+read-only. The ElementTree-based implementation doesn't show namespaces as
+attributes; the lxml.etree-based implementation shows namespaces in a separate
+file "namespaces"; every child tag includes its parent's namespaces.
+
+It is useful to have a top-down view of an XML structure, but it's especially
+convenient to extract text values from tags. One can get, for example, a
+base64-encoded image - just walk down the VFS to the tag's directory and copy
+its text file to a real file.
  
  The VFS was inspired by a FUSE xmlfs: https://github.com/halhen/xmlfs
  
  """
  
-__version__ = "0.2.0"
+__version__ = "0.5.0"
  __author__ = "Oleg Broytman <phd@phdru.name>"
  __copyright__ = "Copyright (C) 2013 PhiloSoft Design"
  __license__ = "GPL"
  
+default_implementation = None # Can be None for default choice,
+                              # 'lxml', 'elementtree' or 'minidom'
+
+use_minidom = True
+use_elementtree = False
+use_lxml = False
+
  import math
  import sys
  import xml.dom.minidom
  
+try:
+    import xml.etree.ElementTree as ET
+except ImportError:
+    pass
+else:
+    use_elementtree = True
+
+try:
+    import lxml.etree as etree
+except ImportError:
+    pass
+else:
+    use_lxml = True
+
  try:
     import locale
     use_locale = True
@@ -71,7 +101,7 @@ XML Virtual FileSystem for Midnight Commander version %s
  Author: %s
  %s
  
-This is not a program. Put the script in $HOME/.mc/extfs.d or
+This is not a program. Put the script in $HOME/[.local/share/].mc/extfs.d or
  /usr/[local/][lib|share]/mc/extfs. For more information read the source!""",
     __version__, __author__, __copyright__
  )
@@ -80,80 +110,320 @@ This is not a program. Put the script in $HOME/.mc/extfs.d or
  
  locale.setlocale(locale.LC_ALL, '')
  
-def _attrs2text(attrs):
-    attrs = [attrs.item(i) for i in range (attrs.length)]
-    return '\n'.join(["%s=%s" %
-        (a.name.encode(default_encoding, "replace"),
-        a.value.encode(default_encoding, "replace"))
-        for a in attrs])
-
-def _list(node, path=''):
-    childNodes = node.childNodes
-    n = 0
-    for element in childNodes:
-        if element.localName:
-            n += 1
-    if n:
-        width = int(math.log10(n))+1
-        template = "%%0%dd" % width
-    else:
-        template = "%d"
-    n = 0
-    for element in childNodes:
-        if element.localName:
-            n += 1
-            if path:
-                subpath = '%s/%s %s' % (path, template % n, element.localName)
+
+class XmlVfs(object):
+    def __init__(self):
+        self.parse()
+
+    def list(self):
+        self._list(self.getroot())
+
+    def has_ns(self, node):
+        return False
+
+    def get_child_node(self, node, i):
+        n = 0
+        for element in self.getchildren(node):
+            if self.istag(element):
+                n += 1
+                if n == i:
+                    return element
+        xml_error('There are less than %d nodes' % i)
+
+
+class MiniDOMXmlVfs(XmlVfs):
+    def parse(self):
+        self.document = xml.dom.minidom.parse(sys.argv[2])
+
+    def hasattrs(self, node):
+        return bool(node.attributes)
+
+    def attrs2text(self, node):
+        attrs = node.attributes
+        attrs = [attrs.item(i) for i in range (attrs.length)]
+        return '\n'.join(["%s=%s" %
+            (a.name.encode(default_encoding, "replace"),
+            a.value.encode(default_encoding, "replace"))
+            for a in attrs])
+
+    def collect_text(self, node):
+        text_accumulator = []
+        for element in node.childNodes:
+            if element.localName:
+                continue
+            elif element.nodeType == element.COMMENT_NODE:
+                text = u"<!--%s-->" % element.nodeValue
+            elif element.nodeType == element.TEXT_NODE:
+                text = element.nodeValue.strip()
+            else:
+                xml_error("Unknown node type %d" % element.nodeType)
+            if text: text_accumulator.append(text)
+        return '\n'.join(text_accumulator).encode(default_encoding, "replace")
+
+    def _list(self, node, path=''):
+        childNodes = node.childNodes
+        n = 0
+        for element in childNodes:
+            if element.localName:
+                n += 1
+        if n:
+            width = int(math.log10(n))+1
+            template = "%%0%dd" % width
+        else:
+            template = "%d"
+        n = 0
+        for element in childNodes:
+            if element.localName:
+                n += 1
+                if path:
+                    subpath = '%s/%s %s' % (path, template % n, element.localName)
+                else:
+                    subpath = '%s %s' % (template % n, element.localName)
+                subpath_encoded = subpath.encode(default_encoding, "replace")
+                print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
+                if self.hasattrs(element):
+                    attr_text = self.attrs2text(element)
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
+                        len(attr_text), subpath_encoded)
+                text = self.collect_text(element)
+                if text:
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
+                        len(text), subpath_encoded)
+                self._list(element, subpath)
+
+    def getroot(self):
+        return self.document
+
+    def getchildren(self, node):
+        return node.childNodes
+
+    def istag(self, node):
+        return bool(node.localName)
+
+
+if use_elementtree or use_lxml:
+    class CommonEtreeXmlVfs(XmlVfs):
+        def hasattrs(self, node):
+            return bool(node.attrib)
+
+        def collect_text(self, node):
+            text_accumulator = []
+            if node.text:
+                text = node.text.strip()
+                if text: text_accumulator.append(text)
+            for element in node:
+                if not self.istag(element):
+                    text = u"<!--%s-->" % text
+                    text_accumulator.append(text)
+            if node.tail:
+                text = node.tail.strip()
+                if text: text_accumulator.append(text)
+            return '\n'.join(text_accumulator).encode(default_encoding, "replace")
+
+        def getroot(self):
+            return self.document.getroot()
+
+        def getchildren(self, node):
+            return list(node)
+
+        def istag(self, node):
+            return isinstance(node.tag, basestring)
+
+
+if use_elementtree:
+    class ElementTreeXmlVfs(CommonEtreeXmlVfs):
+        def parse(self):
+            # Copied from http://effbot.org/zone/element-pi.htm
+
+            class PIParser(ET.XMLTreeBuilder):
+
+               def __init__(self):
+                   ET.XMLTreeBuilder.__init__(self)
+                   # assumes ElementTree 1.2.X
+                   self._parser.CommentHandler = self.handle_comment
+                   self._parser.ProcessingInstructionHandler = self.handle_pi
+                   self._target.start("document", {})
+
+               def close(self):
+                   self._target.end("document")
+                   return ET.XMLTreeBuilder.close(self)
+
+               def handle_comment(self, data):
+                   self._target.start(ET.Comment, {})
+                   self._target.data(data)
+                   self._target.end(ET.Comment)
+
+               def handle_pi(self, target, data):
+                   self._target.start(ET.PI, {})
+                   self._target.data(target + " " + data)
+                   self._target.end(ET.PI)
+
+            self.document = ET.parse(sys.argv[2], PIParser())
+
+        def attrs2text(self, node):
+            attr_accumulator = []
+            for name, value in node.attrib.items():
+                name = name.encode(default_encoding, "replace")
+                value = value.encode(default_encoding, "replace")
+                if name.startswith('{'):
+                    name = name.split('}', 1)[1] # Remove XML namespace
+                attr_accumulator.append("%s=%s" % (name, value))
+            return '\n'.join(attr_accumulator)
+
+        def _list(self, node, path=''):
+            n = len(node)
+            if n:
+                width = int(math.log10(n))+1
+                template = "%%0%dd" % width
+            else:
+                template = "%d"
+            n = 0
+            for element in node:
+                if not isinstance(element.tag, basestring):
+                    continue
+                n += 1
+                tag = element.tag
+                if tag.startswith('{'):
+                    tag = tag.split('}', 1)[1] # Remove XML namespace
+                if path:
+                    subpath = '%s/%s %s' % (path, template % n, tag)
+                else:
+                    subpath = '%s %s' % (template % n, tag)
+                subpath_encoded = subpath.encode(default_encoding, "replace")
+                print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
+                if self.hasattrs(element):
+                    attr_text = self.attrs2text(element)
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
+                        len(attr_text), subpath_encoded)
+                text = self.collect_text(element)
+                if text:
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
+                        len(text), subpath_encoded)
+                self._list(element, subpath)
+
+
+if use_lxml:
+    class LxmlEtreeXmlVfs(CommonEtreeXmlVfs):
+        def parse(self):
+            self.document = etree.parse(sys.argv[2])
+
+        def attrs2text(self, node):
+            attr_accumulator = []
+            for name, value in node.attrib.items():
+                name = etree.QName(name).localname.encode(default_encoding, "replace")
+                value = value.encode(default_encoding, "replace")
+                attr_accumulator.append("%s=%s" % (name, value))
+            return '\n'.join(attr_accumulator)
+
+        def has_ns(self, node):
+            return bool(node.nsmap)
+
+        def ns2text(self, node):
+            ns_accumulator = []
+            for name, value in node.nsmap.items():
+                name = name.encode(default_encoding, "replace")
+                value = value.encode(default_encoding, "replace")
+                ns_accumulator.append("%s=%s" % (name, value))
+            return '\n'.join(ns_accumulator)
+
+        def list(self):
+            self._list(self.getroot())
+
+        def _list(self, node, path=''):
+            n = len(node)
+            if n:
+                width = int(math.log10(n))+1
+                template = "%%0%dd" % width
              else:
-                subpath = '%s %s' % (template % n, element.localName)
-            subpath_encoded = subpath.encode(default_encoding, "replace")
-            print "dr--r--r-- 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
-            attrs = element.attributes
-            if attrs:
-                attr_text = _attrs2text(attrs)
-                print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
-                    len(attr_text), subpath_encoded)
-            _list(element, subpath)
+                template = "%d"
+            n = 0
+            for element in node:
+                if not isinstance(element.tag, basestring):
+                    continue
+                n += 1
+                tag = etree.QName(element.tag).localname
+                if path:
+                    subpath = '%s/%s %s' % (path, template % n, tag)
+                else:
+                    subpath = '%s %s' % (template % n, tag)
+                subpath_encoded = subpath.encode(default_encoding, "replace")
+                print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
+                if self.hasattrs(element):
+                    attr_text = self.attrs2text(element)
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
+                        len(attr_text), subpath_encoded)
+                if element.nsmap:
+                    ns_text = self.ns2text(element)
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/namespaces" % (
+                        len(ns_text), subpath_encoded)
+                text = self.collect_text(element)
+                if text:
+                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
+                        len(text), subpath_encoded)
+                self._list(element, subpath)
+
+        def getroot(self):
+            return [self.document.getroot()]
+
+
+def build_xmlvfs():
+    if default_implementation is None:
+        if use_lxml:
+            return LxmlEtreeXmlVfs()
+        elif use_elementtree:
+            return ElementTreeXmlVfs()
+        else:
+            return MiniDOMXmlVfs()
+    elif default_implementation == 'minidom':
+        return MiniDOMXmlVfs()
+    elif default_implementation == 'elementtree':
+        return ElementTreeXmlVfs()
+    elif default_implementation == 'lxml':
+        return LxmlEtreeXmlVfs()
+
  
  def mcxml_list():
      """List the entire VFS"""
  
-    dom = xml.dom.minidom.parse(sys.argv[2])
-    _list(dom)
+    xmlvfs = build_xmlvfs()
+    xmlvfs.list()
  
  
-def _get_child_node(node, i):
-    n = 0
-    for element in node.childNodes:
-        if element.localName:
-            n += 1
-            if n == i:
-                return element
-    xml_error('There are less than %d nodes' % i)
-
  def mcxml_copyout():
      """Extract a file from the VFS"""
  
-    node = xml.dom.minidom.parse(sys.argv[2])
+    xmlvfs = build_xmlvfs()
      xml_filename = sys.argv[3]
      real_filename = sys.argv[4]
  
+    node = xmlvfs.getroot()
      for path_comp in xml_filename.split('/'):
          if ' ' in path_comp:
              i = int(path_comp.split(' ', 1)[0])
-            node = _get_child_node(node, i)
-        elif path_comp == 'attributes':
+            node = xmlvfs.get_child_node(node, i)
+        elif path_comp in ('attributes', 'namespaces', 'text'):
              break
          else:
              xml_error('Unknown file')
  
      if path_comp == 'attributes':
-        attrs = node.attributes
-        if attrs:
-            text = _attrs2text(attrs)
+        if xmlvfs.hasattrs(node):
+            text = xmlvfs.attrs2text(node)
+        else:
+            xml_error('There are no attributes')
+
+    elif path_comp == 'namespaces':
+        if xmlvfs.has_ns(node):
+            text = xmlvfs.ns2text(node)
          else:
              xml_error('There are no attributes')
  
+    elif path_comp == 'text':
+        text = xmlvfs.collect_text(node)
+
+    else:
+        xml_error('Unknown file')
+
      outfile = open(real_filename, 'w')
      outfile.write(text)
      outfile.close()