3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2010, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79 from __future__ import generators
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
83 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84 __license__ = "New-style BSD"
86 from sgmllib import SGMLParser, SGMLParseError
93 from htmlentitydefs import name2codepoint
99 from sets import Set as set
# These hacks make Beautiful Soup able to parse XML with namespaces:
# the replacement patterns accept ':' (plus '-', '_' and '.') inside a
# name, so namespaced tags like 'dc:title' and namespaced declarations
# parse as a single name instead of being truncated at the colon.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a str() rendering is requested and no explicit
# encoding is supplied.
DEFAULT_OUTPUT_ENCODING = "utf-8"
107 def _match_css_class(str):
108 """Build a RE to match the given CSS class."""
109 return re.compile(r"(^|.*\s)%s($|\s)" % str)
111 # First, the classes that represent markup elements.
113 class PageElement(object):
114 """Contains the navigational information for some part of the page
115 (either a tag or a piece of text)"""
117 def setup(self, parent=None, previous=None):
118 """Sets up the initial relations between this element and
121 self.previous = previous
123 self.previousSibling = None
124 self.nextSibling = None
125 if self.parent and self.parent.contents:
126 self.previousSibling = self.parent.contents[-1]
127 self.previousSibling.nextSibling = self
129 def replaceWith(self, replaceWith):
130 oldParent = self.parent
131 myIndex = self.parent.index(self)
132 if hasattr(replaceWith, "parent")\
133 and replaceWith.parent is self.parent:
134 # We're replacing this element with one of its siblings.
135 index = replaceWith.parent.index(replaceWith)
136 if index and index < myIndex:
137 # Furthermore, it comes before this element. That
138 # means that when we extract it, the index of this
139 # element will change.
140 myIndex = myIndex - 1
142 oldParent.insert(myIndex, replaceWith)
144 def replaceWithChildren(self):
145 myParent = self.parent
146 myIndex = self.parent.index(self)
148 reversedChildren = list(self.contents)
149 reversedChildren.reverse()
150 for child in reversedChildren:
151 myParent.insert(myIndex, child)
154 """Destructively rips this element out of the tree."""
157 del self.parent.contents[self.parent.index(self)]
161 #Find the two elements that would be next to each other if
162 #this element (and any children) hadn't been parsed. Connect
164 lastChild = self._lastRecursiveChild()
165 nextElement = lastChild.next
168 self.previous.next = nextElement
170 nextElement.previous = self.previous
172 lastChild.next = None
175 if self.previousSibling:
176 self.previousSibling.nextSibling = self.nextSibling
178 self.nextSibling.previousSibling = self.previousSibling
179 self.previousSibling = self.nextSibling = None
182 def _lastRecursiveChild(self):
183 "Finds the last element beneath this object to be parsed."
185 while hasattr(lastChild, 'contents') and lastChild.contents:
186 lastChild = lastChild.contents[-1]
189 def insert(self, position, newChild):
190 if isinstance(newChild, basestring) \
191 and not isinstance(newChild, NavigableString):
192 newChild = NavigableString(newChild)
194 position = min(position, len(self.contents))
195 if hasattr(newChild, 'parent') and newChild.parent is not None:
196 # We're 'inserting' an element that's already one
197 # of this object's children.
198 if newChild.parent is self:
199 index = self.index(newChild)
201 # Furthermore we're moving it further down the
202 # list of this object's children. That means that
203 # when we extract this element, our target index
204 # will jump down one.
205 position = position - 1
208 newChild.parent = self
211 newChild.previousSibling = None
212 newChild.previous = self
214 previousChild = self.contents[position-1]
215 newChild.previousSibling = previousChild
216 newChild.previousSibling.nextSibling = newChild
217 newChild.previous = previousChild._lastRecursiveChild()
218 if newChild.previous:
219 newChild.previous.next = newChild
221 newChildsLastElement = newChild._lastRecursiveChild()
223 if position >= len(self.contents):
224 newChild.nextSibling = None
227 parentsNextSibling = None
228 while not parentsNextSibling:
229 parentsNextSibling = parent.nextSibling
230 parent = parent.parent
231 if not parent: # This is the last element in the document.
233 if parentsNextSibling:
234 newChildsLastElement.next = parentsNextSibling
236 newChildsLastElement.next = None
238 nextChild = self.contents[position]
239 newChild.nextSibling = nextChild
240 if newChild.nextSibling:
241 newChild.nextSibling.previousSibling = newChild
242 newChildsLastElement.next = nextChild
244 if newChildsLastElement.next:
245 newChildsLastElement.next.previous = newChildsLastElement
246 self.contents.insert(position, newChild)
248 def append(self, tag):
249 """Appends the given tag to the contents of this tag."""
250 self.insert(len(self.contents), tag)
252 def findNext(self, name=None, attrs={}, text=None, **kwargs):
253 """Returns the first item that matches the given criteria and
254 appears after this Tag in the document."""
255 return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
257 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
259 """Returns all items that match the given criteria and appear
260 after this Tag in the document."""
261 return self._findAll(name, attrs, text, limit, self.nextGenerator,
264 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
265 """Returns the closest sibling to this Tag that matches the
266 given criteria and appears after this Tag in the document."""
267 return self._findOne(self.findNextSiblings, name, attrs, text,
270 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
272 """Returns the siblings of this Tag that match the given
273 criteria and appear after this Tag in the document."""
274 return self._findAll(name, attrs, text, limit,
275 self.nextSiblingGenerator, **kwargs)
276 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
278 def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
279 """Returns the first item that matches the given criteria and
280 appears before this Tag in the document."""
281 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
283 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
285 """Returns all items that match the given criteria and appear
286 before this Tag in the document."""
287 return self._findAll(name, attrs, text, limit, self.previousGenerator,
289 fetchPrevious = findAllPrevious # Compatibility with pre-3.x
291 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
292 """Returns the closest sibling to this Tag that matches the
293 given criteria and appears before this Tag in the document."""
294 return self._findOne(self.findPreviousSiblings, name, attrs, text,
297 def findPreviousSiblings(self, name=None, attrs={}, text=None,
298 limit=None, **kwargs):
299 """Returns the siblings of this Tag that match the given
300 criteria and appear before this Tag in the document."""
301 return self._findAll(name, attrs, text, limit,
302 self.previousSiblingGenerator, **kwargs)
303 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
305 def findParent(self, name=None, attrs={}, **kwargs):
306 """Returns the closest parent of this Tag that matches the given
308 # NOTE: We can't use _findOne because findParents takes a different
311 l = self.findParents(name, attrs, 1)
316 def findParents(self, name=None, attrs={}, limit=None, **kwargs):
317 """Returns the parents of this Tag that match the given
320 return self._findAll(name, attrs, None, limit, self.parentGenerator,
322 fetchParents = findParents # Compatibility with pre-3.x
324 #These methods do the real heavy lifting.
326 def _findOne(self, method, name, attrs, text, **kwargs):
328 l = method(name, attrs, text, 1, **kwargs)
333 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
334 "Iterates over a generator looking for things that match."
336 if isinstance(name, SoupStrainer):
338 # (Possibly) special case some findAll*(...) searches
339 elif text is None and not limit and not attrs and not kwargs:
342 return [element for element in generator()
343 if isinstance(element, Tag)]
344 # findAll*('tag-name')
345 elif isinstance(name, basestring):
346 return [element for element in generator()
347 if isinstance(element, Tag) and
348 element.name == name]
350 strainer = SoupStrainer(name, attrs, text, **kwargs)
351 # Build a SoupStrainer
353 strainer = SoupStrainer(name, attrs, text, **kwargs)
354 results = ResultSet(strainer)
359 except StopIteration:
362 found = strainer.search(i)
364 results.append(found)
365 if limit and len(results) >= limit:
369 #These Generators can be used to navigate starting from both
370 #NavigableStrings and Tags.
371 def nextGenerator(self):
377 def nextSiblingGenerator(self):
383 def previousGenerator(self):
389 def previousSiblingGenerator(self):
392 i = i.previousSibling
395 def parentGenerator(self):
402 def substituteEncoding(self, str, encoding=None):
403 encoding = encoding or "utf-8"
404 return str.replace("%SOUP-ENCODING%", encoding)
406 def toEncoding(self, s, encoding=None):
407 """Encodes an object to a string in some encoding, or to Unicode.
409 if isinstance(s, unicode):
411 s = s.encode(encoding)
412 elif isinstance(s, str):
414 s = s.encode(encoding)
419 s = self.toEncoding(str(s), encoding)
424 class NavigableString(unicode, PageElement):
426 def __new__(cls, value):
427 """Create a new NavigableString.
429 When unpickling a NavigableString, this method is called with
430 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
431 passed in to the superclass's __new__ or the superclass won't know
432 how to handle non-ASCII characters.
434 if isinstance(value, unicode):
435 return unicode.__new__(cls, value)
436 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
438 def __getnewargs__(self):
439 return (NavigableString.__str__(self),)
441 def __getattr__(self, attr):
442 """text.string gives you text. This is for backwards
443 compatibility for Navigable*String, but for CData* it lets you
444 get the string without the CData wrapper."""
448 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
450 def __unicode__(self):
451 return str(self).decode(DEFAULT_OUTPUT_ENCODING)
453 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
455 return self.encode(encoding)
class CData(NavigableString):
    """A NavigableString that renders wrapped in a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + inner + "]]>"
464 class ProcessingInstruction(NavigableString):
465 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
467 if "%SOUP-ENCODING%" in output:
468 output = self.substituteEncoding(output, encoding)
469 return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
    """A NavigableString that renders as an HTML/XML comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--" + inner + "-->"
class Declaration(NavigableString):
    """A NavigableString that renders as a markup declaration (<!...>)."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!" + inner + ">"
479 class Tag(PageElement):
481 """Represents a found HTML tag with its attributes and contents."""
484 "Cheap function to invert a hash."
486 for k,v in h.items():
490 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
496 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
498 def _convertEntities(self, match):
499 """Used in a call to re.sub to replace HTML, XML, and numeric
500 entities with the appropriate Unicode characters. If HTML
501 entities are being converted, any unrecognized entities are
504 if self.convertHTMLEntities and x in name2codepoint:
505 return unichr(name2codepoint[x])
506 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
507 if self.convertXMLEntities:
508 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
511 elif len(x) > 0 and x[0] == '#':
512 # Handle numeric entities
513 if len(x) > 1 and x[1] == 'x':
514 return unichr(int(x[2:], 16))
516 return unichr(int(x[1:]))
518 elif self.escapeUnrecognizedEntities:
519 return u'&%s;' % x
523 def __init__(self, parser, name, attrs=None, parent=None,
527 # We don't actually store the parser object: that lets extracted
528 # chunks be garbage-collected
529 self.parserClass = parser.__class__
530 self.isSelfClosing = parser.isSelfClosingTag(name)
534 elif isinstance(attrs, dict):
535 attrs = attrs.items()
538 self.setup(parent, previous)
540 self.containsSubstitutions = False
541 self.convertHTMLEntities = parser.convertHTMLEntities
542 self.convertXMLEntities = parser.convertXMLEntities
543 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
545 # Convert any HTML, XML, or numeric entities in the attribute values.
546 convert = lambda(k, val): (k,
547 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
548 self._convertEntities,
550 self.attrs = map(convert, self.attrs)
553 if (len(self.contents) == 1
554 and isinstance(self.contents[0], NavigableString)):
555 return self.contents[0]
557 def setString(self, string):
558 """Replace the contents of the tag with a string"""
562 string = property(getString, setString)
564 def getText(self, separator=u""):
565 if not len(self.contents):
567 stopNode = self._lastRecursiveChild().next
569 current = self.contents[0]
570 while current is not stopNode:
571 if isinstance(current, NavigableString):
572 strings.append(current.strip())
573 current = current.next
574 return separator.join(strings)
576 text = property(getText)
578 def get(self, key, default=None):
579 """Returns the value of the 'key' attribute for the tag, or
580 the value given for 'default' if it doesn't have that
582 return self._getAttrMap().get(key, default)
585 """Extract all children."""
586 for child in self.contents[:]:
589 def index(self, element):
590 for i, child in enumerate(self.contents):
593 raise ValueError("Tag.index: element not in tag")
595 def has_key(self, key):
596 return self._getAttrMap().has_key(key)
598 def __getitem__(self, key):
599 """tag[key] returns the value of the 'key' attribute for the tag,
600 and throws an exception if it's not there."""
601 return self._getAttrMap()[key]
604 "Iterating over a tag iterates over its contents."
605 return iter(self.contents)
608 "The length of a tag is the length of its list of contents."
609 return len(self.contents)
611 def __contains__(self, x):
612 return x in self.contents
614 def __nonzero__(self):
615 "A tag is non-None even if it has no contents."
618 def __setitem__(self, key, value):
619 """Setting tag[key] sets the value of the 'key' attribute for the
622 self.attrMap[key] = value
624 for i in range(0, len(self.attrs)):
625 if self.attrs[i][0] == key:
626 self.attrs[i] = (key, value)
629 self.attrs.append((key, value))
630 self._getAttrMap()[key] = value
632 def __delitem__(self, key):
633 "Deleting tag[key] deletes all 'key' attributes for the tag."
634 for item in self.attrs:
636 self.attrs.remove(item)
637 #We don't break because bad HTML can define the same
638 #attribute multiple times.
640 if self.attrMap.has_key(key):
641 del self.attrMap[key]
643 def __call__(self, *args, **kwargs):
644 """Calling a tag like a function is the same as calling its
645 findAll() method. Eg. tag('a') returns a list of all the A tags
646 found within this tag."""
647 return apply(self.findAll, args, kwargs)
649 def __getattr__(self, tag):
650 #print "Getattr %s.%s" % (self.__class__, tag)
651 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
652 return self.find(tag[:-3])
653 elif tag.find('__') != 0:
654 return self.find(tag)
655 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
657 def __eq__(self, other):
658 """Returns true iff this tag has the same name, the same attributes,
659 and the same contents (recursively) as the given tag.
661 NOTE: right now this will return false if two tags have the
662 same attributes in a different order. Should this be fixed?"""
665 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
667 for i in range(0, len(self.contents)):
668 if self.contents[i] != other.contents[i]:
672 def __ne__(self, other):
673 """Returns true iff this tag is not identical to the other tag,
674 as defined in __eq__."""
675 return not self == other
677 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
678 """Renders this tag as a string."""
679 return self.__str__(encoding)
681 def __unicode__(self):
682 return self.__str__(None)
684 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
688 def _sub_entity(self, x):
689 """Used with a regular expression to substitute the
690 appropriate XML entity for an XML special character."""
691 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
693 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694 prettyPrint=False, indentLevel=0):
695 """Returns a string or Unicode representation of this tag and
696 its contents. To get Unicode, pass None for encoding.
698 NOTE: since Python's HTML parser consumes whitespace, this
699 method is not certain to reproduce the whitespace present in
700 the original string."""
702 encodedName = self.toEncoding(self.name, encoding)
706 for key, val in self.attrs:
708 if isinstance(val, basestring):
709 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
710 val = self.substituteEncoding(val, encoding)
712 # The attribute value either:
714 # * Contains no embedded double quotes or single quotes.
715 # No problem: we enclose it in double quotes.
716 # * Contains embedded single quotes. No problem:
717 # double quotes work here too.
718 # * Contains embedded double quotes. No problem:
719 # we enclose it in single quotes.
720 # * Embeds both single _and_ double quotes. This
721 # can't happen naturally, but it can happen if
722 # you modify an attribute value after parsing
723 # the document. Now we have a bit of a
724 # problem. We solve it by enclosing the
725 # attribute in single quotes, and escaping any
726 # embedded single quotes to XML entities.
730 # TODO: replace with apos when
732 val = val.replace("'", "&squot;")
734 # Now we're okay w/r/t quotes. But the attribute
735 # value might also contain angle brackets, or
736 # ampersands that aren't part of entities. We need
737 # to escape those to XML entities too.
738 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
740 attrs.append(fmt % (self.toEncoding(key, encoding),
741 self.toEncoding(val, encoding)))
744 if self.isSelfClosing:
747 closeTag = '</%s>' % encodedName
749 indentTag, indentContents = 0, 0
751 indentTag = indentLevel
752 space = (' ' * (indentTag-1))
753 indentContents = indentTag + 1
754 contents = self.renderContents(encoding, prettyPrint, indentContents)
761 attributeString = ' ' + ' '.join(attrs)
764 s.append('<%s%s%s>' % (encodedName, attributeString, close))
768 if prettyPrint and contents and contents[-1] != "\n":
770 if prettyPrint and closeTag:
773 if prettyPrint and closeTag and self.nextSibling:
779 """Recursively destroys the contents of this tree."""
781 if len(self.contents) == 0:
783 current = self.contents[0]
784 while current is not None:
786 if isinstance(current, Tag):
787 del current.contents[:]
788 current.parent = None
789 current.previous = None
790 current.previousSibling = None
792 current.nextSibling = None
795 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
796 return self.__str__(encoding, True)
798 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
799 prettyPrint=False, indentLevel=0):
800 """Renders the contents of this tag as a string in the given
801 encoding. If encoding is None, returns a Unicode string.."""
805 if isinstance(c, NavigableString):
806 text = c.__str__(encoding)
807 elif isinstance(c, Tag):
808 s.append(c.__str__(encoding, prettyPrint, indentLevel))
809 if text and prettyPrint:
813 s.append(" " * (indentLevel-1))
821 def find(self, name=None, attrs={}, recursive=True, text=None,
823 """Return only the first child of this Tag matching the given
826 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
832 def findAll(self, name=None, attrs={}, recursive=True, text=None,
833 limit=None, **kwargs):
834 """Extracts a list of Tag objects that match the given
835 criteria. You can specify the name of the Tag and any
836 attributes you want the Tag to have.
838 The value of a key-value pair in the 'attrs' map can be a
839 string, a list of strings, a regular expression object, or a
840 callable that takes a string and returns whether or not the
841 string matches for some custom definition of 'matches'. The
842 same is true of the tag name."""
843 generator = self.recursiveChildGenerator
845 generator = self.childGenerator
846 return self._findAll(name, attrs, text, limit, generator, **kwargs)
847 findChildren = findAll
849 # Pre-3.x compatibility methods
853 def fetchText(self, text=None, recursive=True, limit=None):
854 return self.findAll(text=text, recursive=recursive, limit=limit)
856 def firstText(self, text=None, recursive=True):
857 return self.find(text=text, recursive=recursive)
861 def _getAttrMap(self):
862 """Initializes a map representation of this tag's attributes,
863 if not already initialized."""
864 if not getattr(self, 'attrMap'):
866 for (key, value) in self.attrs:
867 self.attrMap[key] = value
871 def childGenerator(self):
872 # Just use the iterator from the contents
873 return iter(self.contents)
875 def recursiveChildGenerator(self):
876 if not len(self.contents):
878 stopNode = self._lastRecursiveChild().next
879 current = self.contents[0]
880 while current is not stopNode:
882 current = current.next
885 # Next, a couple classes to represent queries and their results.
887 """Encapsulates a number of ways of matching a markup element (tag or
890 def __init__(self, name=None, attrs={}, text=None, **kwargs):
892 if isinstance(attrs, basestring):
893 kwargs['class'] = _match_css_class(attrs)
908 return "%s|%s" % (self.name, self.attrs)
910 def searchTag(self, markupName=None, markupAttrs={}):
913 if isinstance(markupName, Tag):
916 callFunctionWithTagData = callable(self.name) \
917 and not isinstance(markupName, Tag)
920 or callFunctionWithTagData \
921 or (markup and self._matches(markup, self.name)) \
922 or (not markup and self._matches(markupName, self.name)):
923 if callFunctionWithTagData:
924 match = self.name(markupName, markupAttrs)
928 for attr, matchAgainst in self.attrs.items():
929 if not markupAttrMap:
930 if hasattr(markupAttrs, 'get'):
931 markupAttrMap = markupAttrs
934 for k,v in markupAttrs:
936 attrValue = markupAttrMap.get(attr)
937 if not self._matches(attrValue, matchAgainst):
947 def search(self, markup):
948 #print 'looking for %s in %s' % (self, markup)
950 # If given a list of items, scan it for a text element that
952 if hasattr(markup, "__iter__") \
953 and not isinstance(markup, Tag):
954 for element in markup:
955 if isinstance(element, NavigableString) \
956 and self.search(element):
959 # If it's a Tag, make sure its name or attributes match.
960 # Don't bother with Tags if we're searching for text.
961 elif isinstance(markup, Tag):
963 found = self.searchTag(markup)
964 # If it's text, make sure the text matches.
965 elif isinstance(markup, NavigableString) or \
966 isinstance(markup, basestring):
967 if self._matches(markup, self.text):
970 raise Exception, "I don't know how to match against a %s" \
974 def _matches(self, markup, matchAgainst):
975 #print "Matching %s against %s" % (markup, matchAgainst)
977 if matchAgainst is True:
978 result = markup is not None
979 elif callable(matchAgainst):
980 result = matchAgainst(markup)
982 #Custom match methods take the tag as an argument, but all
983 #other ways of matching match the tag name as a string.
984 if isinstance(markup, Tag):
986 if markup and not isinstance(markup, basestring):
987 markup = unicode(markup)
988 #Now we know that chunk is either a string, or None.
989 if hasattr(matchAgainst, 'match'):
990 # It's a regexp object.
991 result = markup and matchAgainst.search(markup)
992 elif hasattr(matchAgainst, '__iter__'): # list-like
993 result = markup in matchAgainst
994 elif hasattr(matchAgainst, 'items'):
995 result = markup.has_key(matchAgainst)
996 elif matchAgainst and isinstance(markup, basestring):
997 if isinstance(markup, unicode):
998 matchAgainst = unicode(matchAgainst)
1000 matchAgainst = str(matchAgainst)
1003 result = matchAgainst == markup
1006 class ResultSet(list):
1007 """A ResultSet is just a list that keeps track of the SoupStrainer
1009 def __init__(self, source):
1011 self.source = source
1013 # Now, some helper functions.
1015 def buildTagMap(default, *args):
1016 """Turns a list of maps, lists, or scalars into a single map.
1017 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1018 NESTING_RESET_TAGS maps out of lists and partial maps."""
1020 for portion in args:
1021 if hasattr(portion, 'items'):
1022 #It's a map. Merge it.
1023 for k,v in portion.items():
1025 elif hasattr(portion, '__iter__'): # is a list
1026 #It's a list. Map each item to the default.
1030 #It's a scalar. Map it to the default.
1031 built[portion] = default
1034 # Now, the parser classes.
1036 class BeautifulStoneSoup(Tag, SGMLParser):
1038 """This class contains the basic parser and search code. It defines
1039 a parser that knows nothing about tag behavior except for the
1042 You can't close a tag without closing all the tags it encloses.
1043 That is, "<foo><bar></foo>" actually means
1044 "<foo><bar></bar></foo>".
1046 [Another possible explanation is "<foo><bar /></foo>", but since
1047 this class defines no SELF_CLOSING_TAGS, it will never use that
1050 This class is useful for parsing XML or made-up markup languages,
1051 or when BeautifulSoup makes an assumption counter to what you were
1054 SELF_CLOSING_TAGS = {}
1056 RESET_NESTING_TAGS = {}
1058 PRESERVE_WHITESPACE_TAGS = []
1060 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1061 lambda x: x.group(1) + ' />'),
1062 (re.compile('<!\s+([^<>]*)>'),
1063 lambda x: '<!' + x.group(1) + '>')
1066 ROOT_TAG_NAME = u'[document]'
1068 HTML_ENTITIES = "html"
1069 XML_ENTITIES = "xml"
1070 XHTML_ENTITIES = "xhtml"
1071 # TODO: This only exists for backwards-compatibility
1072 ALL_ENTITIES = XHTML_ENTITIES
1074 # Used when determining whether a text node is all whitespace and
1075 # can be replaced with a single space. A text node that contains
1076 # fancy Unicode spaces (usually non-breaking) should be left
1078 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1080 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1081 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1082 convertEntities=None, selfClosingTags=None, isHTML=False):
1083 """The Soup object is initialized as the 'root tag', and the
1084 provided markup (which can be a string or a file-like object)
1085 is fed into the underlying parser.
1087 sgmllib will process most bad HTML, and the BeautifulSoup
1088 class has some tricks for dealing with some HTML that kills
1089 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1090 if your data uses self-closing tags or declarations
1093 By default, Beautiful Soup uses regexes to sanitize input,
1094 avoiding the vast majority of these problems. If the problems
1095 don't apply to you, pass in False for markupMassage, and
1096 you'll get better performance.
1098 The default parser massage techniques fix the two most common
1099 instances of invalid HTML that choke sgmllib:
1101 <br/> (No space between name of closing tag and tag close)
1102 <! --Comment--> (Extraneous whitespace in declaration)
1104 You can pass in a custom list of (RE object, replace method)
1105 tuples to get Beautiful Soup to scrub your input the way you
1108 self.parseOnlyThese = parseOnlyThese
1109 self.fromEncoding = fromEncoding
1110 self.smartQuotesTo = smartQuotesTo
1111 self.convertEntities = convertEntities
1112 # Set the rules for how we'll deal with the entities we
1114 if self.convertEntities:
1115 # It doesn't make sense to convert encoded characters to
1116 # entities even while you're converting entities to Unicode.
1117 # Just convert it all to Unicode.
1118 self.smartQuotesTo = None
1119 if convertEntities == self.HTML_ENTITIES:
1120 self.convertXMLEntities = False
1121 self.convertHTMLEntities = True
1122 self.escapeUnrecognizedEntities = True
1123 elif convertEntities == self.XHTML_ENTITIES:
1124 self.convertXMLEntities = True
1125 self.convertHTMLEntities = True
1126 self.escapeUnrecognizedEntities = False
1127 elif convertEntities == self.XML_ENTITIES:
1128 self.convertXMLEntities = True
1129 self.convertHTMLEntities = False
1130 self.escapeUnrecognizedEntities = False
1132 self.convertXMLEntities = False
1133 self.convertHTMLEntities = False
1134 self.escapeUnrecognizedEntities = False
1136 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1137 SGMLParser.__init__(self)
1139 if hasattr(markup, 'read'): # It's a file-type object.
1140 markup = markup.read()
1141 self.markup = markup
1142 self.markupMassage = markupMassage
1144 self._feed(isHTML=isHTML)
1147 self.markup = None # The markup can now be GCed
1149 def convert_charref(self, name):
1150 """This method fixes a bug in Python's SGMLParser."""
1155 if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1157 return self.convert_codepoint(n)
1159 def _feed(self, inDocumentEncoding=None, isHTML=False):
1160 # Convert the document to Unicode.
1161 markup = self.markup
1162 if isinstance(markup, unicode):
1163 if not hasattr(self, 'originalEncoding'):
1164 self.originalEncoding = None
1166 dammit = UnicodeDammit\
1167 (markup, [self.fromEncoding, inDocumentEncoding],
1168 smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1169 markup = dammit.unicode
1170 self.originalEncoding = dammit.originalEncoding
1171 self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1173 if self.markupMassage:
1174 if not hasattr(self.markupMassage, "__iter__"):
1175 self.markupMassage = self.MARKUP_MASSAGE
1176 for fix, m in self.markupMassage:
1177 markup = fix.sub(m, markup)
1178 # TODO: We get rid of markupMassage so that the
1179 # soup object can be deepcopied later on. Some
1180 # Python installations can't copy regexes. If anyone
1181 # was relying on the existence of markupMassage, this
1182 # might cause problems.
1183 del(self.markupMassage)
1186 SGMLParser.feed(self, markup)
1187 # Close out any unfinished strings and close all the open tags.
1189 while self.currentTag.name != self.ROOT_TAG_NAME:
1192 def __getattr__(self, methodName):
1193 """This method routes method call requests to either the SGMLParser
1194 superclass or the Tag superclass, depending on the method name."""
1195 #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1197 if methodName.startswith('start_') or methodName.startswith('end_') \
1198 or methodName.startswith('do_'):
1199 return SGMLParser.__getattr__(self, methodName)
1200 elif not methodName.startswith('__'):
1201 return Tag.__getattr__(self, methodName)
1203 raise AttributeError
1205 def isSelfClosingTag(self, name):
1206 """Returns true iff the given string is the name of a
1207 self-closing tag according to this parser."""
1208 return self.SELF_CLOSING_TAGS.has_key(name) \
1209 or self.instanceSelfClosingTags.has_key(name)
1212 Tag.__init__(self, self, self.ROOT_TAG_NAME)
1214 SGMLParser.reset(self)
1215 self.currentData = []
1216 self.currentTag = None
1218 self.quoteStack = []
1222 tag = self.tagStack.pop()
1224 #print "Pop", tag.name
1226 self.currentTag = self.tagStack[-1]
1227 return self.currentTag
1229 def pushTag(self, tag):
1230 #print "Push", tag.name
1232 self.currentTag.contents.append(tag)
1233 self.tagStack.append(tag)
1234 self.currentTag = self.tagStack[-1]
1236 def endData(self, containerClass=NavigableString):
1237 if self.currentData:
1238 currentData = u''.join(self.currentData)
1239 if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1240 not set([tag.name for tag in self.tagStack]).intersection(
1241 self.PRESERVE_WHITESPACE_TAGS)):
1242 if '\n' in currentData:
1246 self.currentData = []
1247 if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1248 (not self.parseOnlyThese.text or \
1249 not self.parseOnlyThese.search(currentData)):
1251 o = containerClass(currentData)
1252 o.setup(self.currentTag, self.previous)
1254 self.previous.next = o
1256 self.currentTag.contents.append(o)
1259 def _popToTag(self, name, inclusivePop=True):
1260 """Pops the tag stack up to and including the most recent
1261 instance of the given tag. If inclusivePop is false, pops the tag
1262 stack up to but *not* including the most recent instqance of
1264 #print "Popping to %s" % name
1265 if name == self.ROOT_TAG_NAME:
1269 mostRecentTag = None
1270 for i in range(len(self.tagStack)-1, 0, -1):
1271 if name == self.tagStack[i].name:
1272 numPops = len(self.tagStack)-i
1274 if not inclusivePop:
1275 numPops = numPops - 1
1277 for i in range(0, numPops):
1278 mostRecentTag = self.popTag()
1279 return mostRecentTag
1281 def _smartPop(self, name):
1283 """We need to pop up to the previous tag of this type, unless
1284 one of this tag's nesting reset triggers comes between this
1285 tag and the previous tag of this type, OR unless this tag is a
1286 generic nesting trigger and another generic nesting trigger
1287 comes between this tag and the previous tag of this type.
1290 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1294 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1296 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1299 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1300 isNestable = nestingResetTriggers != None
1301 isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1304 for i in range(len(self.tagStack)-1, 0, -1):
1305 p = self.tagStack[i]
1306 if (not p or p.name == name) and not isNestable:
1307 #Non-nestable tags get popped to the top or to their
1311 if (nestingResetTriggers is not None
1312 and p.name in nestingResetTriggers) \
1313 or (nestingResetTriggers is None and isResetNesting
1314 and self.RESET_NESTING_TAGS.has_key(p.name)):
1316 #If we encounter one of the nesting reset triggers
1317 #peculiar to this tag, or we encounter another tag
1318 #that causes nesting to reset, pop up to but not
1319 #including that tag.
1325 self._popToTag(popTo, inclusive)
1327 def unknown_starttag(self, name, attrs, selfClosing=0):
1328 #print "Start tag %s: %s" % (name, attrs)
1330 #This is not a real tag.
1331 #print "<%s> is not real!" % name
1332 attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
1333 self.handle_data('<%s%s>' % (name, attrs))
1337 if not self.isSelfClosingTag(name) and not selfClosing:
1338 self._smartPop(name)
1340 if self.parseOnlyThese and len(self.tagStack) <= 1 \
1341 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1344 tag = Tag(self, name, attrs, self.currentTag, self.previous)
1346 self.previous.next = tag
1349 if selfClosing or self.isSelfClosingTag(name):
1351 if name in self.QUOTE_TAGS:
1352 #print "Beginning quote (%s)" % name
1353 self.quoteStack.append(name)
1357 def unknown_endtag(self, name):
1358 #print "End tag %s" % name
1359 if self.quoteStack and self.quoteStack[-1] != name:
1360 #This is not a real end tag.
1361 #print "</%s> is not real!" % name
1362 self.handle_data('</%s>' % name)
1365 self._popToTag(name)
1366 if self.quoteStack and self.quoteStack[-1] == name:
1367 self.quoteStack.pop()
1368 self.literal = (len(self.quoteStack) > 0)
1370 def handle_data(self, data):
1371 self.currentData.append(data)
1373 def _toStringSubclass(self, text, subclass):
1374 """Adds a certain piece of text to the tree as a NavigableString
1377 self.handle_data(text)
1378 self.endData(subclass)
1380 def handle_pi(self, text):
1381 """Handle a processing instruction as a ProcessingInstruction
1382 object, possibly one with a %SOUP-ENCODING% slot into which an
1383 encoding will be plugged later."""
1384 if text[:3] == "xml":
1385 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1386 self._toStringSubclass(text, ProcessingInstruction)
1388 def handle_comment(self, text):
1389 "Handle comments as Comment objects."
1390 self._toStringSubclass(text, Comment)
1392 def handle_charref(self, ref):
1393 "Handle character references as data."
1394 if self.convertEntities:
1395 data = unichr(int(ref))
1397 data = '&#%s;' % ref
1398 self.handle_data(data)
1400 def handle_entityref(self, ref):
1401 """Handle entity references as data, possibly converting known
1402 HTML and/or XML entity references to the corresponding Unicode
1405 if self.convertHTMLEntities:
1407 data = unichr(name2codepoint[ref])
1411 if not data and self.convertXMLEntities:
1412 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1414 if not data and self.convertHTMLEntities and \
1415 not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416 # TODO: We've got a problem here. We're told this is
1417 # an entity reference, but it's not an XML entity
1418 # reference or an HTML entity reference. Nonetheless,
1419 # the logical thing to do is to pass it through as an
1420 # unrecognized entity reference.
1422 # Except: when the input is "&carol;" this function
1423 # will be called with input "carol". When the input is
1424 # "AT&T", this function will be called with input
1425 # "T". We have no way of knowing whether a semicolon
1426 # was present originally, so we don't know whether
1427 # this is an unknown entity or just a misplaced
1430 # The more common case is a misplaced ampersand, so I
1431 # escape the ampersand and omit the trailing semicolon.
1432 data = "&%s" % ref
1434 # This case is different from the one above, because we
1435 # haven't already gone through a supposedly comprehensive
1436 # mapping of entities to Unicode characters. We might not
1437 # have gone through any mapping at all. So the chances are
1438 # very high that this is a real entity, and not a
1439 # misplaced ampersand.
1441 self.handle_data(data)
1443 def handle_decl(self, data):
1444 "Handle DOCTYPEs and the like as Declaration objects."
1445 self._toStringSubclass(data, Declaration)
1447 def parse_declaration(self, i):
1448 """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449 declaration as a CData object."""
1451 if self.rawdata[i:i+9] == '<![CDATA[':
1452 k = self.rawdata.find(']]>', i)
1454 k = len(self.rawdata)
1455 data = self.rawdata[i+9:k]
1457 self._toStringSubclass(data, CData)
1460 j = SGMLParser.parse_declaration(self, i)
1461 except SGMLParseError:
1462 toHandle = self.rawdata[i:]
1463 self.handle_data(toHandle)
1464 j = i + len(toHandle)
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML documents get HTML smart-quote handling by default, and
        # are flagged as HTML so meta-tag encoding sniffing kicks in.
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised by start_meta to abort a parse that will be redone with
    better encoding information."""
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No tag is treated as nestable: bad markup gets no nesting help.
    NESTABLE_TAGS = buildTagMap([])
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # When a tag holding exactly one string child is closed,
        # mirror that string onto the parent as an attribute (unless
        # the parent already has an attribute of that name).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1703 #Enterprise class names! It has come to our attention that some people
1704 #think the names of the Beautiful Soup parser classes are too silly
1705 #and "unprofessional" for use in enterprise screen-scraping. We feel
1706 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1707 #All-Night Kosher Bakery recommends renaming this file to
1708 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1709 #"RobustParserBeanInterface.class") and using the following
1710 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass
1722 ######################################################
1724 # Bonus library: Unicode, Dammit
1726 # This class forces XML data into a standard format (usually to UTF-8
1727 # or Unicode). It is heavily based on code from Mark Pilgrim's
1728 # Universal Feed Parser. It does not rewrite the XML or HTML to
1729 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1730 # (XML) and BeautifulSoup.start_meta (HTML).
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    # chardet is optional; UnicodeDammit checks for None before use.
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
    import iconv_codec
except ImportError:
    # Extra codecs are a nice-to-have; proceed without them.
    pass
1753 class UnicodeDammit:
1754 """A class for detecting the encoding of a *ML document and
1755 converting it to a Unicode string. If the source encoding is
1756 windows-1252, can replace MS smart quotes with their HTML or XML
1759 # This dictionary maps commonly seen values for "charset" in HTML
1760 # meta tags to the corresponding Python codec names. It only covers
1761 # values that aren't in Python's aliases and can't be determined
1762 # by the heuristics in find_codec.
1763 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1764 "x-sjis" : "shift-jis" }
1766 def __init__(self, markup, overrideEncodings=[],
1767 smartQuotesTo='xml', isHTML=False):
1768 self.declaredHTMLEncoding = None
1769 self.markup, documentEncoding, sniffedEncoding = \
1770 self._detectEncoding(markup, isHTML)
1771 self.smartQuotesTo = smartQuotesTo
1772 self.triedEncodings = []
1773 if markup == '' or isinstance(markup, unicode):
1774 self.originalEncoding = None
1775 self.unicode = unicode(markup)
1779 for proposedEncoding in overrideEncodings:
1780 u = self._convertFrom(proposedEncoding)
1783 for proposedEncoding in (documentEncoding, sniffedEncoding):
1784 u = self._convertFrom(proposedEncoding)
1787 # If no luck and we have auto-detection library, try that:
1788 if not u and chardet and not isinstance(self.markup, unicode):
1789 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1791 # As a last resort, try utf-8 and windows-1252:
1793 for proposed_encoding in ("utf-8", "windows-1252"):
1794 u = self._convertFrom(proposed_encoding)
1798 if not u: self.originalEncoding = None
1800 def _subMSChar(self, orig):
1801 """Changes a MS smart quote character to an XML or HTML
1803 sub = self.MS_CHARS.get(orig)
1804 if isinstance(sub, tuple):
1805 if self.smartQuotesTo == 'xml':
1806 sub = '&#x%s;' % sub[1]
1808 sub = '&%s;' % sub[0]
1811 def _convertFrom(self, proposed):
1812 proposed = self.find_codec(proposed)
1813 if not proposed or proposed in self.triedEncodings:
1815 self.triedEncodings.append(proposed)
1816 markup = self.markup
1818 # Convert smart quotes to HTML if coming from an encoding
1819 # that might have them.
1820 if self.smartQuotesTo and proposed.lower() in("windows-1252",
1823 markup = re.compile("([\x80-\x9f])").sub \
1824 (lambda(x): self._subMSChar(x.group(1)),
1828 # print "Trying to convert document to %s" % proposed
1829 u = self._toUnicode(markup, proposed)
1831 self.originalEncoding = proposed
1832 except Exception, e:
1833 # print "That didn't work!"
1836 #print "Correct encoding: %s" % proposed
1839 def _toUnicode(self, data, encoding):
1840 '''Given a string and its encoding, decodes the string into Unicode.
1841 %encoding is a string recognized by encodings.aliases'''
1843 # strip Byte Order Mark (if present)
1844 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1845 and (data[2:4] != '\x00\x00'):
1846 encoding = 'utf-16be'
1848 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1849 and (data[2:4] != '\x00\x00'):
1850 encoding = 'utf-16le'
1852 elif data[:3] == '\xef\xbb\xbf':
1855 elif data[:4] == '\x00\x00\xfe\xff':
1856 encoding = 'utf-32be'
1858 elif data[:4] == '\xff\xfe\x00\x00':
1859 encoding = 'utf-32le'
1861 newdata = unicode(data, encoding)
1864 def _detectEncoding(self, xml_data, isHTML=False):
1865 """Given a document, tries to detect its XML encoding."""
1866 xml_encoding = sniffed_xml_encoding = None
1868 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1870 xml_data = self._ebcdic_to_ascii(xml_data)
1871 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1873 sniffed_xml_encoding = 'utf-16be'
1874 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1875 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1876 and (xml_data[2:4] != '\x00\x00'):
1878 sniffed_xml_encoding = 'utf-16be'
1879 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1880 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1882 sniffed_xml_encoding = 'utf-16le'
1883 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1884 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1885 (xml_data[2:4] != '\x00\x00'):
1887 sniffed_xml_encoding = 'utf-16le'
1888 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1889 elif xml_data[:4] == '\x00\x00\x00\x3c':
1891 sniffed_xml_encoding = 'utf-32be'
1892 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1893 elif xml_data[:4] == '\x3c\x00\x00\x00':
1895 sniffed_xml_encoding = 'utf-32le'
1896 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1897 elif xml_data[:4] == '\x00\x00\xfe\xff':
1899 sniffed_xml_encoding = 'utf-32be'
1900 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1901 elif xml_data[:4] == '\xff\xfe\x00\x00':
1903 sniffed_xml_encoding = 'utf-32le'
1904 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1905 elif xml_data[:3] == '\xef\xbb\xbf':
1907 sniffed_xml_encoding = 'utf-8'
1908 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1910 sniffed_xml_encoding = 'ascii'
1913 xml_encoding_match = None
1914 xml_encoding_match = re.compile(
1915 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1916 if not xml_encoding_match and isHTML:
1917 regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1918 xml_encoding_match = regexp.search(xml_data)
1919 if xml_encoding_match is not None:
1920 xml_encoding = xml_encoding_match.groups()[0].lower()
1922 self.declaredHTMLEncoding = xml_encoding
1923 if sniffed_xml_encoding and \
1924 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1925 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1926 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1928 xml_encoding = sniffed_xml_encoding
1929 return xml_data, xml_encoding, sniffed_xml_encoding
1932 def find_codec(self, charset):
1933 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1934 or (charset and self._codec(charset.replace("-", ""))) \
1935 or (charset and self._codec(charset.replace("-", "_"))) \
1938 def _codec(self, charset):
1939 if not charset: return charset
1942 codecs.lookup(charset)
1944 except (LookupError, ValueError):
1948 EBCDIC_TO_ASCII_MAP = None
1949 def _ebcdic_to_ascii(self, s):
1951 if not c.EBCDIC_TO_ASCII_MAP:
1952 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1953 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1954 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1955 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1956 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1957 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1958 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1959 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1960 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1961 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1962 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1963 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1964 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1965 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1966 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1967 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1968 250,251,252,253,254,255)
1970 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1971 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1972 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1974 MS_CHARS = { '\x80' : ('euro', '20AC'),
1976 '\x82' : ('sbquo', '201A'),
1977 '\x83' : ('fnof', '192'),
1978 '\x84' : ('bdquo', '201E'),
1979 '\x85' : ('hellip', '2026'),
1980 '\x86' : ('dagger', '2020'),
1981 '\x87' : ('Dagger', '2021'),
1982 '\x88' : ('circ', '2C6'),
1983 '\x89' : ('permil', '2030'),
1984 '\x8A' : ('Scaron', '160'),
1985 '\x8B' : ('lsaquo', '2039'),
1986 '\x8C' : ('OElig', '152'),
1988 '\x8E' : ('#x17D', '17D'),
1991 '\x91' : ('lsquo', '2018'),
1992 '\x92' : ('rsquo', '2019'),
1993 '\x93' : ('ldquo', '201C'),
1994 '\x94' : ('rdquo', '201D'),
1995 '\x95' : ('bull', '2022'),
1996 '\x96' : ('ndash', '2013'),
1997 '\x97' : ('mdash', '2014'),
1998 '\x98' : ('tilde', '2DC'),
1999 '\x99' : ('trade', '2122'),
2000 '\x9a' : ('scaron', '161'),
2001 '\x9b' : ('rsaquo', '203A'),
2002 '\x9c' : ('oelig', '153'),
2004 '\x9e' : ('#x17E', '17E'),
2005 '\x9f' : ('Yuml', ''),}
2007 #######################################################################
2010 #By default, act as an HTML pretty-printer.
2011 if __name__ == '__main__':
2013 soup = BeautifulSoup(sys.stdin)
2014 print soup.prettify()