5 from html.parser import HTMLParser as _HTMLParser
6 from html.entities import entitydefs
8 from HTMLParser import HTMLParser as _HTMLParser
9 from htmlentitydefs import entitydefs
def join_attrs(attrs):
    """Serialise parsed tag attributes back into HTML attribute text.

    ``attrs`` is an iterable of ``(name, value)`` pairs as delivered to
    ``handle_starttag``.  A pair whose value is ``None`` is written as a bare
    attribute name; any other value is stripped of surrounding whitespace,
    escaped and wrapped in double quotes.

    Returns an empty string when there are no attributes; otherwise the
    result starts with a space so it can be appended directly after the tag
    name.
    """
    # The leading empty item makes ' '.join() produce the separating space
    # between the tag name and the first attribute.
    pieces = ['']
    for attrname, value in attrs:
        if value is None:
            pieces.append('%s' % attrname)
        else:
            # The parser hands over attribute values with character
            # references already decoded, so they must be re-escaped before
            # being placed inside double quotes.  '&' goes first so the
            # entities produced by the later replacements are not mangled.
            escaped = (value.strip()
                       .replace('&', '&amp;')
                       .replace('<', '&lt;')
                       .replace('>', '&gt;')
                       .replace('"', '&quot;'))
            pieces.append('%s="%s"' % (attrname, escaped))
    return ' '.join(pieces)
class HTMLParser(_HTMLParser):
    """HTML parser that re-serialises everything it sees into ``accumulator``.

    Start and end tags are dispatched to ``start_<tag>`` / ``do_<tag>`` and
    ``end_<tag>`` attributes when the instance has them; all other tags go to
    ``unknown_starttag`` / ``unknown_endtag``, which copy the tag to the
    output.  Subclasses override those hooks to filter or rewrite markup.
    """

    def __init__(self):
        try:
            # Newer Python 3 parsers default to convert_charrefs=True, which
            # decodes "&lt;" into "<" and never calls handle_charref() or
            # handle_entityref().  Turn that off so references are passed
            # through verbatim.
            _HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Older base classes do not accept the keyword.
            _HTMLParser.__init__(self)
        self.accumulator = ""

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to ``start_<tag>``, ``do_<tag>`` or the fallback."""
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return
        method(attrs)

    def handle_endtag(self, tag):
        """Dispatch an end tag to ``end_<tag>`` or the fallback."""
        try:
            method = getattr(self, 'end_' + tag)
        except AttributeError:
            self.unknown_endtag(tag)
            return
        method()

    def handle_data(self, data):
        """Append text content to the output."""
        if data:
            self.accumulator = "%s%s" % (self.accumulator, data)

    def handle_comment(self, data):
        """Append a non-empty comment, restoring its delimiters."""
        if data:
            self.accumulator = "%s<!--%s-->" % (self.accumulator, data)

    def handle_charref(self, name):
        """Append a numeric character reference unchanged."""
        self.accumulator = "%s&#%s;" % (self.accumulator, name)

    def handle_entityref(self, name):
        """Append a named entity reference.

        Only names found in ``entitydefs`` are closed with a semicolon;
        unknown names are written without one.
        """
        # ``in`` works on both Python 2 and 3; dict.has_key() is gone in 3.
        terminator = ';' if name in entitydefs else ''
        self.accumulator = "%s&%s%s" % (self.accumulator, name, terminator)

    # Pass other tags unmodified
    def unknown_starttag(self, tag, attrs):
        """Copy an unhandled start tag, with its attributes, to the output."""
        self.accumulator = "%s<%s%s>" % (self.accumulator, tag, join_attrs(attrs))

    def unknown_endtag(self, tag):
        """Copy an unhandled end tag to the output."""
        self.accumulator = "%s</%s>" % (self.accumulator, tag)
81 # Additional classes for filters
class _allowStartTag(object):
    """Callable start-tag handler that writes its tag back to a filter's output.

    An instance is installed on a filter as ``start_<tag>``; calling it with
    the tag's attributes appends ``<tag ...>`` to the filter's accumulator.
    """

    def __init__(self, filter, tag):
        self.filter = filter
        self.tag = tag

    def __call__(self, attrs):
        # Use a distinct local name so the builtin ``filter`` is not rebound.
        target = self.filter
        target.accumulator = "%s<%s%s>" % (target.accumulator, self.tag, join_attrs(attrs))
93 def __init__(self, filter, tag):
99 filter.accumulator = "%s</%s>" % (filter.accumulator, self.tag)
class HTMLFilter(HTMLParser):
    """Parser that drops comments and every tag not explicitly allowed.

    Tags are whitelisted per instance with ``allow_startTag`` and
    ``allow_endTag``, which install handler objects built from the
    ``allowStartTagClass`` / ``allowEndTagClass`` factories.
    """

    allowStartTagClass = _allowStartTag
    allowEndTagClass = _allowEndTag

    def handle_comment(self, data):
        """Discard comments."""
        return None

    # Filter out all tags
    def unknown_starttag(self, tag, attrs):
        """Discard start tags that have not been allowed."""
        return None

    def unknown_endtag(self, tag):
        """Discard end tags that have not been allowed."""
        return None

    def allow_startTag(self, tag):
        """Let start tags named ``tag`` through to the output."""
        handler = self.allowStartTagClass(self, tag)
        setattr(self, "start_%s" % tag, handler)

    def allow_endTag(self, tag):
        """Let end tags named ``tag`` through to the output."""
        handler = self.allowEndTagClass(self, tag)
        setattr(self, "end_%s" % tag, handler)
126 #class DocHTMLFilter(HTMLFilter):
127 # def __init__(self):
128 # HTMLFilter.__init__(self)
130 # # allow tags <table>, <tr>, <td>
131 # # ... and closing tags
133 # self.allow_startTag('table')
134 # self.allow_endTag('table')
136 # self.allow_startTag('tr')
137 # self.allow_endTag('tr')
139 # self.allow_startTag('td')
140 # self.allow_endTag('td')
def filter_html(str, filter=None):
    """Process HTML using some HTML parser/filter.

    ``str`` is the HTML text to process.  ``filter`` is a parser object with
    ``feed()``, ``close()`` and an ``accumulator`` attribute; when omitted, a
    fresh ``HTMLFilter`` is used.

    Returns the filter's accumulated output.
    """
    if filter is None:
        filter = HTMLFilter()

    filter.feed(str)
    # feed() may keep an incomplete trailing construct buffered; close()
    # forces the parser to process whatever is left so it is not lost.
    filter.close()
    return filter.accumulator