xml version 1.1.3: fix a minor bug in handling namespaces

[extfs.d.git] / xml
diff --git a/xml b/xml

index 15ef7117f65788269070259e166d9585ada87955..4822ab8c6005edc39c7995afe1fd856adcc62883 100755 (executable)
--- a/xml
+++ b/xml
@@ -13,17 +13,23 @@ For mc 4.7+ run this "cd" command in the Midnight Commander (in the "bindings"
  file the command is "%cd"): cd file/xml://; in older versions it is
  cd file#xml, where "file" is the name of your XML file.
  
+See detailed installation instructions at
+http://phdru.name/Software/mc/xml_INSTALL.html.
+
  The VFS represents tags as directories; the directories are numbered to
-distinguish tags with the same name; also numbering helps to sort tags by their
+distinguish tags with the same name; numbering also helps to sort tags by their
  order in XML instead of sorting them by name. Attributes, text nodes and
  comments are represented as text files; attributes are shown in a file named
  "attributes", attributes are listed in the file as name=value lines (I
  deliberately ignore a small chance of newline characters in values); names and
  values are reencoded to the console encoding. Text nodes and comments are
  collected in a file named "text", stripped and reencoded. The filesystem is
-read-only. ElementTree-based implementation doesn't show namespaces as
-attributes; lxml.etree-based implementation shows namespaces as a separate file
-"namespaces"; every child tag includes its parent's namespaces.
+read-only.
+
+The minidom-based implementation doesn't understand namespaces; it just shows
+them among other attributes. The ElementTree-based implementation doesn't show
+namespaces at all. The lxml.etree-based implementation shows namespaces in a
+separate file "namespaces".
  
  It is useful to have a top-down view on an XML structure but it's especially
  convenient to extract text values from tags. One can get, for example, a
@@ -34,13 +40,13 @@ The VFS was inspired by a FUSE xmlfs: https://github.com/halhen/xmlfs
  
  """
  
-__version__ = "0.5.0"
+__version__ = "1.1.3"
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2013 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2013-2015 PhiloSoft Design"
  __license__ = "GPL"
  
-default_implementation = None # Can be None for default choice,
-                              # 'lxml', 'elementtree' or 'minidom'
+force_implementation = None  # Can be None for default choice,
+                             # 'lxml', 'elementtree' or 'minidom'
  
  use_minidom = True
  use_elementtree = False
@@ -112,16 +118,53 @@ locale.setlocale(locale.LC_ALL, '')
  
  
  class XmlVfs(object):
+    """Abstract base class"""
+
+    supports_namespaces = False
+
      def __init__(self):
          self.parse()
  
      def list(self):
+        root_comments = self.get_root_comments()
+        if root_comments:
+            print "-r--r--r-- 1 user group %d Jan 1 00:00 text" % (len(root_comments))
          self._list(self.getroot())
  
-    def has_ns(self, node):
-        return False
-
-    def get_child_node(self, node, i):
+    def _list(self, node, path=''):
+        n = len(self.getchildren(node))
+        if n:
+            width = int(math.log10(n)) + 1
+            template = "%%0%dd" % width
+        else:
+            template = "%d"
+        n = 0
+        for element in self.getchildren(node):
+            if not self.istag(element):
+                continue
+            n += 1
+            tag = self.getlocalname(self.gettag(element))
+            if path:
+                subpath = '%s/%s %s' % (path, template % n, tag)
+            else:
+                subpath = '%s %s' % (template % n, tag)
+            subpath_encoded = subpath.encode(default_encoding, "replace")
+            print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
+            if self.getattrs(element):
+                attr_text = self.attrs2text(element)
+                print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
+                    len(attr_text), subpath_encoded)
+            if self.supports_namespaces and self.has_ns(element):
+                ns_text = self.ns2text(element)
+                print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/namespaces" % (
+                    len(ns_text), subpath_encoded)
+            text = self.collect_text(element)
+            if text:
+                print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
+                    len(text), subpath_encoded)
+            self._list(element, subpath)
+
+    def get_tag_node(self, node, i):
          n = 0
          for element in self.getchildren(node):
              if self.istag(element):
@@ -130,21 +173,26 @@ class XmlVfs(object):
                      return element
          xml_error('There are less than %d nodes' % i)
  
+    def attrs2text(self, node):
+        attr_accumulator = []
+        for name, value in self.getattrs(node):
+            name = self.getlocalname(name).encode(default_encoding, "replace")
+            value = value.encode(default_encoding, "replace")
+            attr_accumulator.append("%s=%s" % (name, value))
+        return '\n'.join(attr_accumulator)
+
+    def has_ns(self, node):
+        return False
+
  
  class MiniDOMXmlVfs(XmlVfs):
      def parse(self):
          self.document = xml.dom.minidom.parse(sys.argv[2])
  
-    def hasattrs(self, node):
-        return bool(node.attributes)
-
-    def attrs2text(self, node):
+    def getattrs(self, node):
          attrs = node.attributes
-        attrs = [attrs.item(i) for i in range (attrs.length)]
-        return '\n'.join(["%s=%s" %
-            (a.name.encode(default_encoding, "replace"),
-            a.value.encode(default_encoding, "replace"))
-            for a in attrs])
+        attrs = [attrs.item(i) for i in range(attrs.length)]
+        return [(a.name, a.value) for a in attrs]
  
      def collect_text(self, node):
          text_accumulator = []
@@ -160,51 +208,29 @@ class MiniDOMXmlVfs(XmlVfs):
              if text: text_accumulator.append(text)
          return '\n'.join(text_accumulator).encode(default_encoding, "replace")
  
-    def _list(self, node, path=''):
-        childNodes = node.childNodes
-        n = 0
-        for element in childNodes:
-            if element.localName:
-                n += 1
-        if n:
-            width = int(math.log10(n))+1
-            template = "%%0%dd" % width
-        else:
-            template = "%d"
-        n = 0
-        for element in childNodes:
-            if element.localName:
-                n += 1
-                if path:
-                    subpath = '%s/%s %s' % (path, template % n, element.localName)
-                else:
-                    subpath = '%s %s' % (template % n, element.localName)
-                subpath_encoded = subpath.encode(default_encoding, "replace")
-                print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
-                if self.hasattrs(element):
-                    attr_text = self.attrs2text(element)
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
-                        len(attr_text), subpath_encoded)
-                text = self.collect_text(element)
-                if text:
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
-                        len(text), subpath_encoded)
-                self._list(element, subpath)
-
      def getroot(self):
          return self.document
  
+    def get_root_comments(self):
+        return self.collect_text(self.document)
+
      def getchildren(self, node):
          return node.childNodes
  
+    def gettag(self, node):
+        return node.localName
+
      def istag(self, node):
          return bool(node.localName)
  
+    def getlocalname(self, name):
+        return name
+
  
  if use_elementtree or use_lxml:
      class CommonEtreeXmlVfs(XmlVfs):
-        def hasattrs(self, node):
-            return bool(node.attrib)
+        def getattrs(self, node):
+            return node.attrib.items()
  
          def collect_text(self, node):
              text_accumulator = []
@@ -213,19 +239,19 @@ if use_elementtree or use_lxml:
                  if text: text_accumulator.append(text)
              for element in node:
                  if not self.istag(element):
-                    text = u"<!--%s-->" % text
+                    text = u"<!--%s-->" % element.text
                      text_accumulator.append(text)
              if node.tail:
                  text = node.tail.strip()
                  if text: text_accumulator.append(text)
              return '\n'.join(text_accumulator).encode(default_encoding, "replace")
  
-        def getroot(self):
-            return self.document.getroot()
-
          def getchildren(self, node):
              return list(node)
  
+        def gettag(self, node):
+            return node.tag
+
          def istag(self, node):
              return isinstance(node.tag, basestring)
  
@@ -233,153 +259,111 @@ if use_elementtree or use_lxml:
  if use_elementtree:
      class ElementTreeXmlVfs(CommonEtreeXmlVfs):
          def parse(self):
-            # Copied from http://effbot.org/zone/element-pi.ht
+            # Copied from http://effbot.org/zone/element-pi.htm
  
              class PIParser(ET.XMLTreeBuilder):
  
-               def __init__(self):
-                   ET.XMLTreeBuilder.__init__(self)
-                   # assumes ElementTree 1.2.X
-                   self._parser.CommentHandler = self.handle_comment
-                   self._parser.ProcessingInstructionHandler = self.handle_pi
-                   self._target.start("document", {})
+                def __init__(self):
+                    ET.XMLTreeBuilder.__init__(self)
+                    # assumes ElementTree 1.2.X
+                    self._parser.CommentHandler = self.handle_comment
+                    self._parser.ProcessingInstructionHandler = self.handle_pi
+                    self._target.start("document", {})
  
-               def close(self):
-                   self._target.end("document")
-                   return ET.XMLTreeBuilder.close(self)
+                def close(self):
+                    self._target.end("document")
+                    return ET.XMLTreeBuilder.close(self)
  
-               def handle_comment(self, data):
-                   self._target.start(ET.Comment, {})
-                   self._target.data(data)
-                   self._target.end(ET.Comment)
+                def handle_comment(self, data):
+                    self._target.start(ET.Comment, {})
+                    self._target.data(data)
+                    self._target.end(ET.Comment)
  
-               def handle_pi(self, target, data):
-                   self._target.start(ET.PI, {})
-                   self._target.data(target + " " + data)
-                   self._target.end(ET.PI)
+                def handle_pi(self, target, data):
+                    self._target.start(ET.PI, {})
+                    self._target.data(target + " " + data)
+                    self._target.end(ET.PI)
  
              self.document = ET.parse(sys.argv[2], PIParser())
  
-        def attrs2text(self, node):
-            attr_accumulator = []
-            for name, value in node.attrib.items():
-                name = name.encode(default_encoding, "replace")
-                value = value.encode(default_encoding, "replace")
-                if name.startswith('{'):
-                    name = name.split('}', 1)[1] # Remove XML namespace
-                attr_accumulator.append("%s=%s" % (name, value))
-            return '\n'.join(attr_accumulator)
-
-        def _list(self, node, path=''):
-            n = len(node)
-            if n:
-                width = int(math.log10(n))+1
-                template = "%%0%dd" % width
-            else:
-                template = "%d"
-            n = 0
-            for element in node:
-                if not isinstance(element.tag, basestring):
-                    continue
-                n += 1
-                tag = element.tag
-                if tag.startswith('{'):
-                    tag = tag.split('}', 1)[1] # Remove XML namespace
-                if path:
-                    subpath = '%s/%s %s' % (path, template % n, tag)
-                else:
-                    subpath = '%s %s' % (template % n, tag)
-                subpath_encoded = subpath.encode(default_encoding, "replace")
-                print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
-                if self.hasattrs(element):
-                    attr_text = self.attrs2text(element)
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
-                        len(attr_text), subpath_encoded)
-                text = self.collect_text(element)
-                if text:
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
-                        len(text), subpath_encoded)
-                self._list(element, subpath)
+        def getroot(self):
+            return self.document.getroot()
+
+        def get_root_comments(self):
+            text_accumulator = []
+            for element in self.getroot():
+                if not self.istag(element):
+                    text = u"<!--%s-->" % element.text
+                    text_accumulator.append(text)
+            return '\n'.join(text_accumulator).encode(default_encoding, "replace")
+
+        def getlocalname(self, name):
+            if name.startswith('{'):
+                name = name.split('}', 1)[1]  # Remove XML namespace
+            return name
  
  
  if use_lxml:
      class LxmlEtreeXmlVfs(CommonEtreeXmlVfs):
+        supports_namespaces = True
+
          def parse(self):
              self.document = etree.parse(sys.argv[2])
  
-        def attrs2text(self, node):
-            attr_accumulator = []
-            for name, value in node.attrib.items():
-                name = etree.QName(name).localname.encode(default_encoding, "replace")
-                value = value.encode(default_encoding, "replace")
-                attr_accumulator.append("%s=%s" % (name, value))
-            return '\n'.join(attr_accumulator)
+        def getroot(self):
+            return [self.document.getroot()]
+
+        def get_root_comments(self):
+            text_accumulator = []
+            for element in self.document.getroot().itersiblings(tag=etree.Comment, preceding=True):
+                text = u"<!--%s-->" % element.text
+                text_accumulator.append(text)
+            return '\n'.join(text_accumulator).encode(default_encoding, "replace")
+
+        def getlocalname(self, name):
+            return etree.QName(name).localname
+
+        def _get_local_ns(self, node):
+            this_nsmap = node.nsmap
+            parent = node.getparent()
+            if parent is not None:
+                parent_nsmap = parent.nsmap
+                for key in parent_nsmap:
+                    if this_nsmap[key] == parent_nsmap[key]:
+                        del this_nsmap[key]
+            return this_nsmap
  
          def has_ns(self, node):
-            return bool(node.nsmap)
+            return bool(self._get_local_ns(node))
  
          def ns2text(self, node):
              ns_accumulator = []
-            for name, value in node.nsmap.items():
-                name = name.encode(default_encoding, "replace")
+            for name, value in self._get_local_ns(node).items():
+                if name:
+                    name = name.encode(default_encoding, "replace")
+                else:
+                    name = 'xmlns'
                  value = value.encode(default_encoding, "replace")
                  ns_accumulator.append("%s=%s" % (name, value))
              return '\n'.join(ns_accumulator)
  
-        def list(self):
-            self._list(self.getroot())
-
-        def _list(self, node, path=''):
-            n = len(node)
-            if n:
-                width = int(math.log10(n))+1
-                template = "%%0%dd" % width
-            else:
-                template = "%d"
-            n = 0
-            for element in node:
-                if not isinstance(element.tag, basestring):
-                    continue
-                n += 1
-                tag = etree.QName(element.tag).localname
-                if path:
-                    subpath = '%s/%s %s' % (path, template % n, tag)
-                else:
-                    subpath = '%s %s' % (template % n, tag)
-                subpath_encoded = subpath.encode(default_encoding, "replace")
-                print "dr-xr-xr-x 1 user group 0 Jan 1 00:00 %s" % subpath_encoded
-                if self.hasattrs(element):
-                    attr_text = self.attrs2text(element)
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/attributes" % (
-                        len(attr_text), subpath_encoded)
-                if element.nsmap:
-                    ns_text = self.ns2text(element)
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/namespaces" % (
-                        len(ns_text), subpath_encoded)
-                text = self.collect_text(element)
-                if text:
-                    print "-r--r--r-- 1 user group %d Jan 1 00:00 %s/text" % (
-                        len(text), subpath_encoded)
-                self._list(element, subpath)
-
-        def getroot(self):
-            return [self.document.getroot()]
-
  
  def build_xmlvfs():
-    if default_implementation is None:
+    if force_implementation is None:
          if use_lxml:
              return LxmlEtreeXmlVfs()
          elif use_elementtree:
              return ElementTreeXmlVfs()
          else:
              return MiniDOMXmlVfs()
-    elif default_implementation == 'minidom':
+    elif force_implementation == 'minidom':
          return MiniDOMXmlVfs()
-    elif default_implementation == 'elementtree':
+    elif force_implementation == 'elementtree':
          return ElementTreeXmlVfs()
-    elif default_implementation == 'lxml':
+    elif force_implementation == 'lxml':
          return LxmlEtreeXmlVfs()
+    else:
+        raise ValueError('Unknown implementation "%s", expected "minidom", "elementtree" or "lxml"' % force_implementation)
  
  
  def mcxml_list():
@@ -400,26 +384,29 @@ def mcxml_copyout():
      for path_comp in xml_filename.split('/'):
          if ' ' in path_comp:
              i = int(path_comp.split(' ', 1)[0])
-            node = xmlvfs.get_child_node(node, i)
+            node = xmlvfs.get_tag_node(node, i)
          elif path_comp in ('attributes', 'namespaces', 'text'):
              break
          else:
              xml_error('Unknown file')
  
      if path_comp == 'attributes':
-        if xmlvfs.hasattrs(node):
+        if xmlvfs.getattrs(node):
              text = xmlvfs.attrs2text(node)
          else:
              xml_error('There are no attributes')
  
      elif path_comp == 'namespaces':
-        if xmlvfs.has_ns(node):
+        if xmlvfs.supports_namespaces and xmlvfs.has_ns(node):
              text = xmlvfs.ns2text(node)
          else:
-            xml_error('There are no attributes')
+            xml_error('There are no namespaces')
  
      elif path_comp == 'text':
-        text = xmlvfs.collect_text(node)
+        if '/' in xml_filename:
+            text = xmlvfs.collect_text(node)
+        else:
+            text = xmlvfs.get_root_comments()
  
      else:
          xml_error('Unknown file')