m_lib/net/www/html.py

   1 """HTML parsers"""
   2
   3
   4 try:
   5     from html.parser import HTMLParser as _HTMLParser
   6     from html.entities import entitydefs
   7 except ImportError:
   8     from HTMLParser import HTMLParser as _HTMLParser
   9     from htmlentitydefs import entitydefs
  10
  11
  12 def join_attrs(attrs):
  13    attr_list = ['']
  14    for attrname, value in attrs:
  15       if value is None:
  16          attr_list.append('%s' % attrname)
  17       else:
  18          attr_list.append('%s="%s"' % (attrname, value.strip()))
  19
  20    return ' '.join(attr_list)
  21
  22
  23 class HTMLParser(_HTMLParser):
  24
  25
  26    def __init__(self):
  27       _HTMLParser.__init__(self)
  28       self.accumulator = ""
  29
  30
  31    def handle_starttag(self, tag, attrs):
  32       try:
  33          method = getattr(self, 'start_' + tag)
  34       except AttributeError:
  35          try:
  36             method = getattr(self, 'do_' + tag)
  37          except AttributeError:
  38             self.unknown_starttag(tag, attrs)
  39          else:
  40             method(attrs)
  41       else:
  42          method(attrs)
  43
  44    def handle_endtag(self, tag):
  45       try:
  46          method = getattr(self, 'end_' + tag)
  47       except AttributeError:
  48          self.unknown_endtag(tag)
  49       else:
  50          method()
  51
  52
  53    def handle_data(self, data):
  54       if data:
  55          self.accumulator = "%s%s" % (self.accumulator, data)
  56
  57    def handle_comment(self, data):
  58       if data:
  59          self.accumulator = "%s<!--%s-->" % (self.accumulator, data)
  60
  61
  62    def handle_charref(self, name):
  63       self.accumulator = "%s&#%s;" % (self.accumulator, name)
  64
  65    def handle_entityref(self, name):
  66       if entitydefs.has_key(name): # If it is one of the standard SGML entities - close it with semicolon
  67          x = ';'
  68       else:
  69          x = ''
  70       self.accumulator = "%s&%s%s" % (self.accumulator, name, x)
  71
  72
  73    # Pass other tags unmodified
  74    def unknown_starttag(self, tag, attrs):
  75       self.accumulator = "%s<%s%s>" % (self.accumulator, tag, join_attrs(attrs))
  76
  77    def unknown_endtag(self, tag):
  78       self.accumulator = "%s</%s>" % (self.accumulator, tag)
  79
  80
  81 # Additional classes for filters
  82
  83 class _allowStartTag:
  84     def __init__(self, filter, tag):
  85         self.filter = filter
  86         self.tag = tag
  87
  88     def __call__(self, attrs):
  89         filter = self.filter
  90         filter.accumulator = "%s<%s%s>" % (filter.accumulator, self.tag, join_attrs(attrs))
  91
  92 class _allowEndTag:
  93     def __init__(self, filter, tag):
  94         self.filter = filter
  95         self.tag = tag
  96
  97     def __call__(self):
  98         filter = self.filter
  99         filter.accumulator = "%s</%s>" % (filter.accumulator, self.tag)
 100
 101
 102 class HTMLFilter(HTMLParser):
 103    allowStartTagClass = _allowStartTag
 104    allowEndTagClass = _allowEndTag
 105
 106    def handle_comment(self, data):
 107       pass
 108
 109    # Filter out all tags
 110    def unknown_starttag(self, tag, attrs):
 111       pass
 112
 113    def unknown_endtag(self, tag):
 114       pass
 115
 116
 117    def allow_startTag(self, tag):
 118       setattr(self, "start_%s" % tag, self.allowStartTagClass(self, tag))
 119
 120    def allow_endTag(self, tag):
 121       setattr(self, "end_%s" % tag, self.allowEndTagClass(self, tag))
 122
 123
 124 # how to use them:
 125
 126 #class DocHTMLFilter(HTMLFilter):
 127 #    def __init__(self):
 128 #        HTMLFilter.__init__(self)
 129 #
 130 #        # allow tags <table>, <tr>, <td>
 131 #        # ... and closing tags
 132 #
 133 #        self.allow_startTag('table')
 134 #        self.allow_endTag('table')
 135 #
 136 #        self.allow_startTag('tr')
 137 #        self.allow_endTag('tr')
 138 #
 139 #        self.allow_startTag('td')
 140 #        self.allow_endTag('td')
 141
 142 def filter_html(str, filter=None):
 143     "Process HTML using some HTML parser/filter"
 144
 145     if filter is None:
 146        filter = HTMLFilter()
 147
 148     filter.feed(str)
 149     return filter.accumulator