m_lib/net/www/html.py

   1 """HTML parsers"""
   2
   3
   4 from HTMLParser import HTMLParser as _HTMLParser
   5 from htmlentitydefs import entitydefs
   6
   7
   8 def join_attrs(attrs):
   9    attr_list = ['']
  10    for attrname, value in attrs:
  11       if value is None:
  12          attr_list.append('%s' % attrname)
  13       else:
  14          attr_list.append('%s="%s"' % (attrname, value.strip()))
  15
  16    return ' '.join(attr_list)
  17
  18
  19 class HTMLParser(_HTMLParser):
  20
  21
  22    def __init__(self):
  23       _HTMLParser.__init__(self)
  24       self.accumulator = ""
  25
  26
  27    def handle_starttag(self, tag, attrs):
  28       try:
  29          method = getattr(self, 'start_' + tag)
  30       except AttributeError:
  31          try:
  32             method = getattr(self, 'do_' + tag)
  33          except AttributeError:
  34             self.unknown_starttag(tag, attrs)
  35          else:
  36             method(attrs)
  37       else:
  38          method(attrs)
  39
  40    def handle_endtag(self, tag):
  41       try:
  42          method = getattr(self, 'end_' + tag)
  43       except AttributeError:
  44          self.unknown_endtag(tag)
  45       else:
  46          method()
  47
  48
  49    def handle_data(self, data):
  50       if data:
  51          self.accumulator = "%s%s" % (self.accumulator, data)
  52
  53    def handle_comment(self, data):
  54       if data:
  55          self.accumulator = "%s<!--%s-->" % (self.accumulator, data)
  56
  57
  58    def handle_charref(self, name):
  59       self.accumulator = "%s&#%s;" % (self.accumulator, name)
  60
  61    def handle_entityref(self, name):
  62       if entitydefs.has_key(name): # If it is one of the standard SGML entities - close it with semicolon
  63          x = ';'
  64       else:
  65          x = ''
  66       self.accumulator = "%s&%s%s" % (self.accumulator, name, x)
  67
  68
  69    # Pass other tags unmodified
  70    def unknown_starttag(self, tag, attrs):
  71       self.accumulator = "%s<%s%s>" % (self.accumulator, tag, join_attrs(attrs))
  72
  73    def unknown_endtag(self, tag):
  74       self.accumulator = "%s</%s>" % (self.accumulator, tag)
  75
  76
  77 # Additional classes for filters
  78
  79 class _allowStartTag:
  80     def __init__(self, filter, tag):
  81         self.filter = filter
  82         self.tag = tag
  83
  84     def __call__(self, attrs):
  85         filter = self.filter
  86         filter.accumulator = "%s<%s%s>" % (filter.accumulator, self.tag, join_attrs(attrs))
  87
  88 class _allowEndTag:
  89     def __init__(self, filter, tag):
  90         self.filter = filter
  91         self.tag = tag
  92
  93     def __call__(self):
  94         filter = self.filter
  95         filter.accumulator = "%s</%s>" % (filter.accumulator, self.tag)
  96
  97
  98 class HTMLFilter(HTMLParser):
  99    allowStartTagClass = _allowStartTag
 100    allowEndTagClass = _allowEndTag
 101
 102    def handle_comment(self, data):
 103       pass
 104
 105    # Filter out all tags
 106    def unknown_starttag(self, tag, attrs):
 107       pass
 108
 109    def unknown_endtag(self, tag):
 110       pass
 111
 112
 113    def allow_startTag(self, tag):
 114       setattr(self, "start_%s" % tag, self.allowStartTagClass(self, tag))
 115
 116    def allow_endTag(self, tag):
 117       setattr(self, "end_%s" % tag, self.allowEndTagClass(self, tag))
 118
 119
 120 # how to use them:
 121
 122 #class DocHTMLFilter(HTMLFilter):
 123 #    def __init__(self):
 124 #        HTMLFilter.__init__(self)
 125 #
 126 #        # allow tags <table>, <tr>, <td>
 127 #        # ... and closing tags
 128 #
 129 #        self.allow_startTag('table')
 130 #        self.allow_endTag('table')
 131 #
 132 #        self.allow_startTag('tr')
 133 #        self.allow_endTag('tr')
 134 #
 135 #        self.allow_startTag('td')
 136 #        self.allow_endTag('td')
 137
 138 def filter_html(str, filter=None):
 139     "Process HTML using some HTML parser/filter"
 140
 141     if filter is None:
 142        filter = HTMLFilter()
 143
 144     filter.feed(str)
 145     return filter.accumulator