4 from HTMLParser import HTMLParser as _HTMLParser
5 from htmlentitydefs import entitydefs
def join_attrs(attrs):
    """Render ``(name, value)`` attribute pairs as start-tag text.

    ``attrs`` is an iterable of ``(attrname, value)`` pairs; ``value`` may
    be ``None`` for an attribute written without a value.

    Returns an empty string when there are no attributes; otherwise every
    attribute is preceded by a single space, so the result can be placed
    directly after the tag name.
    """
    # Seeding the list with an empty string makes ' '.join() produce the
    # leading space; the "<%s%s>" templates used by callers add none.
    attr_list = ['']
    for attrname, value in attrs:
        if value is None:
            # Attribute without a value, e.g. <input checked>.
            attr_list.append('%s' % attrname)
        else:
            # NOTE(review): the value is only stripped, not re-escaped; a
            # value containing '"' would produce malformed markup - confirm
            # whether callers need escaping here.
            attr_list.append('%s="%s"' % (attrname, value.strip()))
    return ' '.join(attr_list)
class HTMLParser(_HTMLParser):
    """Parser that re-serialises the markup it is fed into ``accumulator``.

    Start tags are dispatched to a ``start_<tag>`` attribute, then to a
    ``do_<tag>`` attribute, and finally to ``unknown_starttag``.  End tags
    are dispatched to ``end_<tag>`` or ``unknown_endtag``.  The default
    handlers append the markup to ``self.accumulator`` unchanged.
    """

    def __init__(self):
        _HTMLParser.__init__(self)
        # Text collected so far; every handler appends to it.
        self.accumulator = ""

    def handle_starttag(self, tag, attrs):
        """Call the handler registered for start tag ``tag`` with ``attrs``."""
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return
        method(attrs)

    def handle_endtag(self, tag):
        """Call the handler registered for end tag ``tag`` (no arguments)."""
        try:
            method = getattr(self, 'end_' + tag)
        except AttributeError:
            self.unknown_endtag(tag)
            return
        method()

    def handle_data(self, data):
        """Append text content to the accumulator."""
        # NOTE(review): the extracted source skips a line between the
        # signature and this statement (possibly an `if data:` guard) -
        # confirm against the original file.
        self.accumulator = "%s%s" % (self.accumulator, data)

    def handle_comment(self, data):
        """Append a comment, restoring its ``<!--`` / ``-->`` delimiters."""
        # NOTE(review): same missing line as in handle_data - confirm
        # whether empty comments were meant to be dropped.
        self.accumulator = "%s<!--%s-->" % (self.accumulator, data)

    def handle_charref(self, name):
        """Append a numeric character reference as ``&#name;``."""
        self.accumulator = "%s&#%s;" % (self.accumulator, name)

    def handle_entityref(self, name):
        """Append a named entity reference as ``&name`` plus a terminator."""
        # If it is one of the standard SGML entities - close it with semicolon
        if name in entitydefs:
            x = ';'
        else:
            x = ''
        self.accumulator = "%s&%s%s" % (self.accumulator, name, x)

    # Pass other tags unmodified
    def unknown_starttag(self, tag, attrs):
        """Append a start tag that has no dedicated handler."""
        self.accumulator = "%s<%s%s>" % (self.accumulator, tag, join_attrs(attrs))

    def unknown_endtag(self, tag):
        """Append an end tag that has no dedicated handler."""
        self.accumulator = "%s</%s>" % (self.accumulator, tag)
77 # Additional classes for filters
class _allowStartTag(object):
    """Callable start-tag handler that re-emits one specific tag.

    An instance is meant to be stored on a filter as ``start_<tag>``; when
    called with the tag's attributes it appends ``<tag attrs>`` to that
    filter's ``accumulator``.
    """

    def __init__(self, filter, tag):
        self.filter = filter  # object whose ``accumulator`` is appended to
        self.tag = tag        # tag name this handler writes out

    def __call__(self, attrs):
        filter = self.filter
        filter.accumulator = "%s<%s%s>" % (filter.accumulator, self.tag, join_attrs(attrs))
class _allowEndTag(object):
    """Callable end-tag handler that re-emits one specific tag.

    An instance is meant to be stored on a filter as ``end_<tag>``; when
    called (without arguments) it appends ``</tag>`` to that filter's
    ``accumulator``.
    """

    def __init__(self, filter, tag):
        self.filter = filter  # object whose ``accumulator`` is appended to
        self.tag = tag        # tag name this handler writes out

    def __call__(self):
        filter = self.filter
        filter.accumulator = "%s</%s>" % (filter.accumulator, self.tag)
class HTMLFilter(HTMLParser):
    """Parser that drops comments and every tag not explicitly allowed.

    Tags are allowed one at a time with ``allow_startTag`` and
    ``allow_endTag``, which attach ``start_<tag>`` / ``end_<tag>`` handlers
    to this instance.
    """

    # Handler factories; a subclass may substitute its own classes.
    allowStartTagClass = _allowStartTag
    allowEndTagClass = _allowEndTag

    def handle_comment(self, data):
        """Discard comments."""
        pass

    # Filter out all tags
    def unknown_starttag(self, tag, attrs):
        """Discard start tags that were not allowed."""
        pass

    def unknown_endtag(self, tag):
        """Discard end tags that were not allowed."""
        pass

    def allow_startTag(self, tag):
        """Let start tag ``tag`` through by installing a ``start_<tag>`` handler."""
        setattr(self, "start_%s" % tag, self.allowStartTagClass(self, tag))

    def allow_endTag(self, tag):
        """Let end tag ``tag`` through by installing an ``end_<tag>`` handler."""
        setattr(self, "end_%s" % tag, self.allowEndTagClass(self, tag))
122 #class DocHTMLFilter(HTMLFilter):
123 # def __init__(self):
124 # HTMLFilter.__init__(self)
126 # # allow tags <table>, <tr>, <td>
127 # # ... and closing tags
129 # self.allow_startTag('table')
130 # self.allow_endTag('table')
132 # self.allow_startTag('tr')
133 # self.allow_endTag('tr')
135 # self.allow_startTag('td')
136 # self.allow_endTag('td')
def filter_html(str, filter=None):
    """Process HTML using some HTML parser/filter.

    ``str`` is the HTML text to process.  ``filter`` is a parser object
    providing ``feed()``, ``close()`` and an ``accumulator`` attribute; when
    omitted, a fresh ``HTMLFilter`` is used.

    Returns the filter's ``accumulator`` after the text has been processed.
    """
    if filter is None:
        filter = HTMLFilter()

    filter.feed(str)
    # close() makes the parser process any input it is still buffering.
    # NOTE(review): the extracted source does not show whether the original
    # called close() - confirm against the original file.
    filter.close()
    return filter.accumulator