3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2010, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79 from __future__ import generators
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
83 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84 __license__ = "New-style BSD"
86 from sgmllib import SGMLParser, SGMLParseError
93 from htmlentitydefs import name2codepoint
99 from sets import Set as set
# These hacks make Beautiful Soup able to parse XML with namespaces:
# the replacement patterns accept ':' (plus '-', '_' and '.') inside a
# name, so namespaced tags like 'dc:title' and namespaced declarations
# parse as a single name instead of being truncated at the colon.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a str() rendering is requested and no explicit
# encoding is supplied.
DEFAULT_OUTPUT_ENCODING = "utf-8"
107 def _match_css_class(str):
108 """Build a RE to match the given CSS class."""
109 return re.compile(r"(^|.*\s)%s($|\s)" % str)
111 # First, the classes that represent markup elements.
113 class PageElement(object):
114 """Contains the navigational information for some part of the page
115 (either a tag or a piece of text)"""
117 def setup(self, parent=None, previous=None):
118 """Sets up the initial relations between this element and
121 self.previous = previous
123 self.previousSibling = None
124 self.nextSibling = None
125 if self.parent and self.parent.contents:
126 self.previousSibling = self.parent.contents[-1]
127 self.previousSibling.nextSibling = self
129 def replaceWith(self, replaceWith):
130 oldParent = self.parent
131 myIndex = self.parent.index(self)
132 if hasattr(replaceWith, "parent")\
133 and replaceWith.parent is self.parent:
134 # We're replacing this element with one of its siblings.
135 index = replaceWith.parent.index(replaceWith)
136 if index and index < myIndex:
137 # Furthermore, it comes before this element. That
138 # means that when we extract it, the index of this
139 # element will change.
140 myIndex = myIndex - 1
142 oldParent.insert(myIndex, replaceWith)
144 def replaceWithChildren(self):
145 myParent = self.parent
146 myIndex = self.parent.index(self)
148 reversedChildren = list(self.contents)
149 reversedChildren.reverse()
150 for child in reversedChildren:
151 myParent.insert(myIndex, child)
154 """Destructively rips this element out of the tree."""
157 del self.parent.contents[self.parent.index(self)]
161 #Find the two elements that would be next to each other if
162 #this element (and any children) hadn't been parsed. Connect
164 lastChild = self._lastRecursiveChild()
165 nextElement = lastChild.next
168 self.previous.next = nextElement
170 nextElement.previous = self.previous
172 lastChild.next = None
175 if self.previousSibling:
176 self.previousSibling.nextSibling = self.nextSibling
178 self.nextSibling.previousSibling = self.previousSibling
179 self.previousSibling = self.nextSibling = None
182 def _lastRecursiveChild(self):
183 "Finds the last element beneath this object to be parsed."
185 while hasattr(lastChild, 'contents') and lastChild.contents:
186 lastChild = lastChild.contents[-1]
189 def insert(self, position, newChild):
190 if isinstance(newChild, basestring) \
191 and not isinstance(newChild, NavigableString):
192 newChild = NavigableString(newChild)
194 position = min(position, len(self.contents))
195 if hasattr(newChild, 'parent') and newChild.parent is not None:
196 # We're 'inserting' an element that's already one
197 # of this object's children.
198 if newChild.parent is self:
199 index = self.index(newChild)
201 # Furthermore we're moving it further down the
202 # list of this object's children. That means that
203 # when we extract this element, our target index
204 # will jump down one.
205 position = position - 1
208 newChild.parent = self
211 newChild.previousSibling = None
212 newChild.previous = self
214 previousChild = self.contents[position-1]
215 newChild.previousSibling = previousChild
216 newChild.previousSibling.nextSibling = newChild
217 newChild.previous = previousChild._lastRecursiveChild()
218 if newChild.previous:
219 newChild.previous.next = newChild
221 newChildsLastElement = newChild._lastRecursiveChild()
223 if position >= len(self.contents):
224 newChild.nextSibling = None
227 parentsNextSibling = None
228 while not parentsNextSibling:
229 parentsNextSibling = parent.nextSibling
230 parent = parent.parent
231 if not parent: # This is the last element in the document.
233 if parentsNextSibling:
234 newChildsLastElement.next = parentsNextSibling
236 newChildsLastElement.next = None
238 nextChild = self.contents[position]
239 newChild.nextSibling = nextChild
240 if newChild.nextSibling:
241 newChild.nextSibling.previousSibling = newChild
242 newChildsLastElement.next = nextChild
244 if newChildsLastElement.next:
245 newChildsLastElement.next.previous = newChildsLastElement
246 self.contents.insert(position, newChild)
248 def append(self, tag):
249 """Appends the given tag to the contents of this tag."""
250 self.insert(len(self.contents), tag)
252 def findNext(self, name=None, attrs={}, text=None, **kwargs):
253 """Returns the first item that matches the given criteria and
254 appears after this Tag in the document."""
255 return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
257 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
259 """Returns all items that match the given criteria and appear
260 after this Tag in the document."""
261 return self._findAll(name, attrs, text, limit, self.nextGenerator,
264 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
265 """Returns the closest sibling to this Tag that matches the
266 given criteria and appears after this Tag in the document."""
267 return self._findOne(self.findNextSiblings, name, attrs, text,
270 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
272 """Returns the siblings of this Tag that match the given
273 criteria and appear after this Tag in the document."""
274 return self._findAll(name, attrs, text, limit,
275 self.nextSiblingGenerator, **kwargs)
276 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
278 def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
279 """Returns the first item that matches the given criteria and
280 appears before this Tag in the document."""
281 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
283 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
285 """Returns all items that match the given criteria and appear
286 before this Tag in the document."""
287 return self._findAll(name, attrs, text, limit, self.previousGenerator,
289 fetchPrevious = findAllPrevious # Compatibility with pre-3.x
291 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
292 """Returns the closest sibling to this Tag that matches the
293 given criteria and appears before this Tag in the document."""
294 return self._findOne(self.findPreviousSiblings, name, attrs, text,
297 def findPreviousSiblings(self, name=None, attrs={}, text=None,
298 limit=None, **kwargs):
299 """Returns the siblings of this Tag that match the given
300 criteria and appear before this Tag in the document."""
301 return self._findAll(name, attrs, text, limit,
302 self.previousSiblingGenerator, **kwargs)
303 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
305 def findParent(self, name=None, attrs={}, **kwargs):
306 """Returns the closest parent of this Tag that matches the given
308 # NOTE: We can't use _findOne because findParents takes a different
311 l = self.findParents(name, attrs, 1)
316 def findParents(self, name=None, attrs={}, limit=None, **kwargs):
317 """Returns the parents of this Tag that match the given
320 return self._findAll(name, attrs, None, limit, self.parentGenerator,
322 fetchParents = findParents # Compatibility with pre-3.x
324 #These methods do the real heavy lifting.
326 def _findOne(self, method, name, attrs, text, **kwargs):
328 l = method(name, attrs, text, 1, **kwargs)
333 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
334 "Iterates over a generator looking for things that match."
336 if isinstance(name, SoupStrainer):
338 # (Possibly) special case some findAll*(...) searches
339 elif text is None and not limit and not attrs and not kwargs:
342 return [element for element in generator()
343 if isinstance(element, Tag)]
344 # findAll*('tag-name')
345 elif isinstance(name, basestring):
346 return [element for element in generator()
347 if isinstance(element, Tag) and
348 element.name == name]
350 strainer = SoupStrainer(name, attrs, text, **kwargs)
351 # Build a SoupStrainer
353 strainer = SoupStrainer(name, attrs, text, **kwargs)
354 results = ResultSet(strainer)
359 except StopIteration:
362 found = strainer.search(i)
364 results.append(found)
365 if limit and len(results) >= limit:
369 #These Generators can be used to navigate starting from both
370 #NavigableStrings and Tags.
371 def nextGenerator(self):
377 def nextSiblingGenerator(self):
383 def previousGenerator(self):
389 def previousSiblingGenerator(self):
392 i = i.previousSibling
395 def parentGenerator(self):
402 def substituteEncoding(self, str, encoding=None):
403 encoding = encoding or "utf-8"
404 return str.replace("%SOUP-ENCODING%", encoding)
406 def toEncoding(self, s, encoding=None):
407 """Encodes an object to a string in some encoding, or to Unicode.
409 if isinstance(s, unicode):
411 s = s.encode(encoding)
412 elif isinstance(s, str):
414 s = s.encode(encoding)
419 s = self.toEncoding(str(s), encoding)
424 class NavigableString(unicode, PageElement):
426 def __new__(cls, value):
427 """Create a new NavigableString.
429 When unpickling a NavigableString, this method is called with
430 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
431 passed in to the superclass's __new__ or the superclass won't know
432 how to handle non-ASCII characters.
434 if isinstance(value, unicode):
435 return unicode.__new__(cls, value)
436 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
438 def __getnewargs__(self):
439 return (NavigableString.__str__(self),)
441 def __getattr__(self, attr):
442 """text.string gives you text. This is for backwards
443 compatibility for Navigable*String, but for CData* it lets you
444 get the string without the CData wrapper."""
448 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
450 def __unicode__(self):
451 return str(self).decode(DEFAULT_OUTPUT_ENCODING)
453 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
455 return self.encode(encoding)
class CData(NavigableString):
    """A NavigableString that renders wrapped in a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + inner + "]]>"
464 class ProcessingInstruction(NavigableString):
465 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
467 if "%SOUP-ENCODING%" in output:
468 output = self.substituteEncoding(output, encoding)
469 return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
    """A NavigableString that renders as an HTML/XML comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--" + inner + "-->"
class Declaration(NavigableString):
    """A NavigableString that renders as a markup declaration (<!...>)."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!" + inner + ">"
479 class Tag(PageElement):
481 """Represents a found HTML tag with its attributes and contents."""
484 "Cheap function to invert a hash."
486 for k,v in h.items():
490 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
496 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
498 def _convertEntities(self, match):
499 """Used in a call to re.sub to replace HTML, XML, and numeric
500 entities with the appropriate Unicode characters. If HTML
501 entities are being converted, any unrecognized entities are
504 if self.convertHTMLEntities and x in name2codepoint:
505 return unichr(name2codepoint[x])
506 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
507 if self.convertXMLEntities:
508 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
511 elif len(x) > 0 and x[0] == '#':
512 # Handle numeric entities
513 if len(x) > 1 and x[1] == 'x':
514 return unichr(int(x[2:], 16))
516 return unichr(int(x[1:]))
518 elif self.escapeUnrecognizedEntities:
519 return u'&%s;' % x
523 def __init__(self, parser, name, attrs=None, parent=None,
527 # We don't actually store the parser object: that lets extracted
528 # chunks be garbage-collected
529 self.parserClass = parser.__class__
530 self.isSelfClosing = parser.isSelfClosingTag(name)
534 elif isinstance(attrs, dict):
535 attrs = attrs.items()
538 self.setup(parent, previous)
540 self.containsSubstitutions = False
541 self.convertHTMLEntities = parser.convertHTMLEntities
542 self.convertXMLEntities = parser.convertXMLEntities
543 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
545 # Convert any HTML, XML, or numeric entities in the attribute values.
546 convert = lambda(k, val): (k,
547 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
548 self._convertEntities,
550 self.attrs = map(convert, self.attrs)
553 if (len(self.contents) == 1
554 and isinstance(self.contents[0], NavigableString)):
555 return self.contents[0]
557 def setString(self, string):
558 """Replace the contents of the tag with a string"""
562 string = property(getString, setString)
564 def getText(self, separator=u""):
565 if not len(self.contents):
567 stopNode = self._lastRecursiveChild().next
569 current = self.contents[0]
570 while current is not stopNode:
571 if isinstance(current, NavigableString):
572 strings.append(current.strip())
573 current = current.next
574 return separator.join(strings)
576 text = property(getText)
578 def get(self, key, default=None):
579 """Returns the value of the 'key' attribute for the tag, or
580 the value given for 'default' if it doesn't have that
582 return self._getAttrMap().get(key, default)
585 """Extract all children."""
586 for child in self.contents[:]:
589 def index(self, element):
590 for i, child in enumerate(self.contents):
593 raise ValueError("Tag.index: element not in tag")
595 def has_key(self, key):
596 return self._getAttrMap().has_key(key)
598 def __getitem__(self, key):
599 """tag[key] returns the value of the 'key' attribute for the tag,
600 and throws an exception if it's not there."""
601 return self._getAttrMap()[key]
604 "Iterating over a tag iterates over its contents."
605 return iter(self.contents)
608 "The length of a tag is the length of its list of contents."
609 return len(self.contents)
611 def __contains__(self, x):
612 return x in self.contents
614 def __nonzero__(self):
615 "A tag is non-None even if it has no contents."
618 def __setitem__(self, key, value):
619 """Setting tag[key] sets the value of the 'key' attribute for the
622 self.attrMap[key] = value
624 for i in range(0, len(self.attrs)):
625 if self.attrs[i][0] == key:
626 self.attrs[i] = (key, value)
629 self.attrs.append((key, value))
630 self._getAttrMap()[key] = value
632 def __delitem__(self, key):
633 "Deleting tag[key] deletes all 'key' attributes for the tag."
634 for item in self.attrs:
636 self.attrs.remove(item)
637 #We don't break because bad HTML can define the same
638 #attribute multiple times.
640 if self.attrMap.has_key(key):
641 del self.attrMap[key]
643 def __call__(self, *args, **kwargs):
644 """Calling a tag like a function is the same as calling its
645 findAll() method. Eg. tag('a') returns a list of all the A tags
646 found within this tag."""
647 return apply(self.findAll, args, kwargs)
649 def __getattr__(self, tag):
650 #print "Getattr %s.%s" % (self.__class__, tag)
651 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
652 return self.find(tag[:-3])
653 elif tag.find('__') != 0:
654 return self.find(tag)
655 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
657 def __eq__(self, other):
658 """Returns true iff this tag has the same name, the same attributes,
659 and the same contents (recursively) as the given tag.
661 NOTE: right now this will return false if two tags have the
662 same attributes in a different order. Should this be fixed?"""
665 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
667 for i in range(0, len(self.contents)):
668 if self.contents[i] != other.contents[i]:
672 def __ne__(self, other):
673 """Returns true iff this tag is not identical to the other tag,
674 as defined in __eq__."""
675 return not self == other
677 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
678 """Renders this tag as a string."""
679 return self.__str__(encoding)
681 def __unicode__(self):
682 return self.__str__(None)
684 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
688 def _sub_entity(self, x):
689 """Used with a regular expression to substitute the
690 appropriate XML entity for an XML special character."""
691 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
693 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694 prettyPrint=False, indentLevel=0):
695 """Returns a string or Unicode representation of this tag and
696 its contents. To get Unicode, pass None for encoding.
698 NOTE: since Python's HTML parser consumes whitespace, this
699 method is not certain to reproduce the whitespace present in
700 the original string."""
702 encodedName = self.toEncoding(self.name, encoding)
706 for key, val in self.attrs:
708 if isinstance(val, basestring):
709 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
710 val = self.substituteEncoding(val, encoding)
712 # The attribute value either:
714 # * Contains no embedded double quotes or single quotes.
715 # No problem: we enclose it in double quotes.
716 # * Contains embedded single quotes. No problem:
717 # double quotes work here too.
718 # * Contains embedded double quotes. No problem:
719 # we enclose it in single quotes.
720 # * Embeds both single _and_ double quotes. This
721 # can't happen naturally, but it can happen if
722 # you modify an attribute value after parsing
723 # the document. Now we have a bit of a
724 # problem. We solve it by enclosing the
725 # attribute in single quotes, and escaping any
726 # embedded single quotes to XML entities.
730 # TODO: replace with apos when
732 val = val.replace("'", "&squot;")
734 # Now we're okay w/r/t quotes. But the attribute
735 # value might also contain angle brackets, or
736 # ampersands that aren't part of entities. We need
737 # to escape those to XML entities too.
738 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
740 attrs.append(fmt % (self.toEncoding(key, encoding),
741 self.toEncoding(val, encoding)))
744 if self.isSelfClosing:
747 closeTag = '</%s>' % encodedName
749 indentTag, indentContents = 0, 0
751 indentTag = indentLevel
752 space = (' ' * (indentTag-1))
753 indentContents = indentTag + 1
754 contents = self.renderContents(encoding, prettyPrint, indentContents)
761 attributeString = ' ' + ' '.join(attrs)
764 s.append('<%s%s%s>' % (encodedName, attributeString, close))
768 if prettyPrint and contents and contents[-1] != "\n":
770 if prettyPrint and closeTag:
773 if prettyPrint and closeTag and self.nextSibling:
779 """Recursively destroys the contents of this tree."""
781 if len(self.contents) == 0:
783 current = self.contents[0]
784 while current is not None:
786 if isinstance(current, Tag):
787 del current.contents[:]
788 current.parent = None
789 current.previous = None
790 current.previousSibling = None
792 current.nextSibling = None
795 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
796 return self.__str__(encoding, True)
798 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
799 prettyPrint=False, indentLevel=0):
800 """Renders the contents of this tag as a string in the given
801 encoding. If encoding is None, returns a Unicode string.."""
805 if isinstance(c, NavigableString):
806 text = c.__str__(encoding)
807 elif isinstance(c, Tag):
808 s.append(c.__str__(encoding, prettyPrint, indentLevel))
809 if text and prettyPrint:
813 s.append(" " * (indentLevel-1))
821 def find(self, name=None, attrs={}, recursive=True, text=None,
823 """Return only the first child of this Tag matching the given
826 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
832 def findAll(self, name=None, attrs={}, recursive=True, text=None,
833 limit=None, **kwargs):
834 """Extracts a list of Tag objects that match the given
835 criteria. You can specify the name of the Tag and any
836 attributes you want the Tag to have.
838 The value of a key-value pair in the 'attrs' map can be a
839 string, a list of strings, a regular expression object, or a
840 callable that takes a string and returns whether or not the
841 string matches for some custom definition of 'matches'. The
842 same is true of the tag name."""
843 generator = self.recursiveChildGenerator
845 generator = self.childGenerator
846 return self._findAll(name, attrs, text, limit, generator, **kwargs)
847 findChildren = findAll
849 # Pre-3.x compatibility methods
853 def fetchText(self, text=None, recursive=True, limit=None):
854 return self.findAll(text=text, recursive=recursive, limit=limit)
856 def firstText(self, text=None, recursive=True):
857 return self.find(text=text, recursive=recursive)
861 def _getAttrMap(self):
862 """Initializes a map representation of this tag's attributes,
863 if not already initialized."""
864 if not getattr(self, 'attrMap'):
866 for (key, value) in self.attrs:
867 self.attrMap[key] = value
871 def childGenerator(self):
872 # Just use the iterator from the contents
873 return iter(self.contents)
875 def recursiveChildGenerator(self):
876 if not len(self.contents):
878 stopNode = self._lastRecursiveChild().next
879 current = self.contents[0]
880 while current is not stopNode:
882 current = current.next
885 # Next, a couple classes to represent queries and their results.
887 """Encapsulates a number of ways of matching a markup element (tag or
890 def __init__(self, name=None, attrs={}, text=None, **kwargs):
892 if isinstance(attrs, basestring):
893 kwargs['class'] = _match_css_class(attrs)
908 return "%s|%s" % (self.name, self.attrs)
910 def searchTag(self, markupName=None, markupAttrs={}):
913 if isinstance(markupName, Tag):
916 callFunctionWithTagData = callable(self.name) \
917 and not isinstance(markupName, Tag)
920 or callFunctionWithTagData \
921 or (markup and self._matches(markup, self.name)) \
922 or (not markup and self._matches(markupName, self.name)):
923 if callFunctionWithTagData:
924 match = self.name(markupName, markupAttrs)
928 for attr, matchAgainst in self.attrs.items():
929 if not markupAttrMap:
930 if hasattr(markupAttrs, 'get'):
931 markupAttrMap = markupAttrs
934 for k,v in markupAttrs:
936 attrValue = markupAttrMap.get(attr)
937 if not self._matches(attrValue, matchAgainst):
947 def search(self, markup):
948 #print 'looking for %s in %s' % (self, markup)
950 # If given a list of items, scan it for a text element that
952 if hasattr(markup, "__iter__") \
953 and not isinstance(markup, Tag):
954 for element in markup:
955 if isinstance(element, NavigableString) \
956 and self.search(element):
959 # If it's a Tag, make sure its name or attributes match.
960 # Don't bother with Tags if we're searching for text.
961 elif isinstance(markup, Tag):
963 found = self.searchTag(markup)
964 # If it's text, make sure the text matches.
965 elif isinstance(markup, NavigableString) or \
966 isinstance(markup, basestring):
967 if self._matches(markup, self.text):
970 raise Exception, "I don't know how to match against a %s" \
974 def _matches(self, markup, matchAgainst):
975 #print "Matching %s against %s" % (markup, matchAgainst)
977 if matchAgainst is True:
978 result = markup is not None
979 elif callable(matchAgainst):
980 result = matchAgainst(markup)
982 #Custom match methods take the tag as an argument, but all
983 #other ways of matching match the tag name as a string.
984 if isinstance(markup, Tag):
986 if markup and not isinstance(markup, basestring):
987 markup = unicode(markup)
988 #Now we know that chunk is either a string, or None.
989 if hasattr(matchAgainst, 'match'):
990 # It's a regexp object.
991 result = markup and matchAgainst.search(markup)
992 elif hasattr(matchAgainst, '__iter__'): # list-like
993 result = markup in matchAgainst
994 elif hasattr(matchAgainst, 'items'):
995 result = markup.has_key(matchAgainst)
996 elif matchAgainst and isinstance(markup, basestring):
997 if isinstance(markup, unicode):
998 matchAgainst = unicode(matchAgainst)
1000 matchAgainst = str(matchAgainst)
1003 result = matchAgainst == markup
1006 class ResultSet(list):
1007 """A ResultSet is just a list that keeps track of the SoupStrainer
1009 def __init__(self, source):
1011 self.source = source
1013 # Now, some helper functions.
1015 def buildTagMap(default, *args):
1016 """Turns a list of maps, lists, or scalars into a single map.
1017 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1018 NESTING_RESET_TAGS maps out of lists and partial maps."""
1020 for portion in args:
1021 if hasattr(portion, 'items'):
1022 #It's a map. Merge it.
1023 for k,v in portion.items():
1025 elif hasattr(portion, '__iter__'): # is a list
1026 #It's a list. Map each item to the default.
1030 #It's a scalar. Map it to the default.
1031 built[portion] = default
1034 # Now, the parser classes.
1036 class BeautifulStoneSoup(Tag, SGMLParser):
1038 """This class contains the basic parser and search code. It defines
1039 a parser that knows nothing about tag behavior except for the
1042 You can't close a tag without closing all the tags it encloses.
1043 That is, "<foo><bar></foo>" actually means
1044 "<foo><bar></bar></foo>".
1046 [Another possible explanation is "<foo><bar /></foo>", but since
1047 this class defines no SELF_CLOSING_TAGS, it will never use that
1050 This class is useful for parsing XML or made-up markup languages,
1051 or when BeautifulSoup makes an assumption counter to what you were
1054 SELF_CLOSING_TAGS = {}
1056 RESET_NESTING_TAGS = {}
1058 PRESERVE_WHITESPACE_TAGS = []
1060 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1061 lambda x: x.group(1) + ' />'),
1062 (re.compile('<!\s+([^<>]*)>'),
1063 lambda x: '<!' + x.group(1) + '>')
1066 ROOT_TAG_NAME = u'[document]'
1068 HTML_ENTITIES = "html"
1069 XML_ENTITIES = "xml"
1070 XHTML_ENTITIES = "xhtml"
1071 # TODO: This only exists for backwards-compatibility
1072 ALL_ENTITIES = XHTML_ENTITIES
1074 # Used when determining whether a text node is all whitespace and
1075 # can be replaced with a single space. A text node that contains
1076 # fancy Unicode spaces (usually non-breaking) should be left
1078 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1080 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1081 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1082 convertEntities=None, selfClosingTags=None, isHTML=False):
1083 """The Soup object is initialized as the 'root tag', and the
1084 provided markup (which can be a string or a file-like object)
1085 is fed into the underlying parser.
1087 sgmllib will process most bad HTML, and the BeautifulSoup
1088 class has some tricks for dealing with some HTML that kills
1089 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1090 if your data uses self-closing tags or declarations
1093 By default, Beautiful Soup uses regexes to sanitize input,
1094 avoiding the vast majority of these problems. If the problems
1095 don't apply to you, pass in False for markupMassage, and
1096 you'll get better performance.
1098 The default parser massage techniques fix the two most common
1099 instances of invalid HTML that choke sgmllib:
1101 <br/> (No space between name of closing tag and tag close)
1102 <! --Comment--> (Extraneous whitespace in declaration)
1104 You can pass in a custom list of (RE object, replace method)
1105 tuples to get Beautiful Soup to scrub your input the way you
1108 self.parseOnlyThese = parseOnlyThese
1109 self.fromEncoding = fromEncoding
1110 self.smartQuotesTo = smartQuotesTo
1111 self.convertEntities = convertEntities
1112 # Set the rules for how we'll deal with the entities we
1114 if self.convertEntities:
1115 # It doesn't make sense to convert encoded characters to
1116 # entities even while you're converting entities to Unicode.
1117 # Just convert it all to Unicode.
1118 self.smartQuotesTo = None
1119 if convertEntities == self.HTML_ENTITIES:
1120 self.convertXMLEntities = False
1121 self.convertHTMLEntities = True
1122 self.escapeUnrecognizedEntities = True
1123 elif convertEntities == self.XHTML_ENTITIES:
1124 self.convertXMLEntities = True
1125 self.convertHTMLEntities = True
1126 self.escapeUnrecognizedEntities = False
1127 elif convertEntities == self.XML_ENTITIES:
1128 self.convertXMLEntities = True
1129 self.convertHTMLEntities = False
1130 self.escapeUnrecognizedEntities = False
1132 self.convertXMLEntities = False
1133 self.convertHTMLEntities = False
1134 self.escapeUnrecognizedEntities = False
1136 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1137 SGMLParser.__init__(self)
1139 if hasattr(markup, 'read'): # It's a file-type object.
1140 markup = markup.read()
1141 self.markup = markup
1142 self.markupMassage = markupMassage
1144 self._feed(isHTML=isHTML)
1147 self.markup = None # The markup can now be GCed
1149 def convert_charref(self, name):
1150 """This method fixes a bug in Python's SGMLParser."""
1155 if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1157 return self.convert_codepoint(n)
1159 def _feed(self, inDocumentEncoding=None, isHTML=False):
1160 # Convert the document to Unicode.
1161 markup = self.markup
1162 if isinstance(markup, unicode):
1163 if not hasattr(self, 'originalEncoding'):
1164 self.originalEncoding = None
1166 dammit = UnicodeDammit\
1167 (markup, [self.fromEncoding, inDocumentEncoding],
1168 smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1169 markup = dammit.unicode
1170 self.originalEncoding = dammit.originalEncoding
1171 self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1173 if self.markupMassage:
1174 if not hasattr(self.markupMassage, "__iter__"):
1175 self.markupMassage = self.MARKUP_MASSAGE
1176 for fix, m in self.markupMassage:
1177 markup = fix.sub(m, markup)
1178 # TODO: We get rid of markupMassage so that the
1179 # soup object can be deepcopied later on. Some
1180 # Python installations can't copy regexes. If anyone
1181 # was relying on the existence of markupMassage, this
1182 # might cause problems.
1183 del(self.markupMassage)
1186 SGMLParser.feed(self, markup)
1187 # Close out any unfinished strings and close all the open tags.
1189 while self.currentTag.name != self.ROOT_TAG_NAME:
1192 def __getattr__(self, methodName):
1193 """This method routes method call requests to either the SGMLParser
1194 superclass or the Tag superclass, depending on the method name."""
1195 #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1197 if methodName.startswith('start_') or methodName.startswith('end_') \
1198 or methodName.startswith('do_'):
1199 return SGMLParser.__getattr__(self, methodName)
1200 elif not methodName.startswith('__'):
1201 return Tag.__getattr__(self, methodName)
1203 raise AttributeError
1205 def isSelfClosingTag(self, name):
1206 """Returns true iff the given string is the name of a
1207 self-closing tag according to this parser."""
1208 return self.SELF_CLOSING_TAGS.has_key(name) \
1209 or self.instanceSelfClosingTags.has_key(name)
1212 Tag.__init__(self, self, self.ROOT_TAG_NAME)
1214 SGMLParser.reset(self)
1215 self.currentData = []
1216 self.currentTag = None
1218 self.quoteStack = []
1222 tag = self.tagStack.pop()
1224 #print "Pop", tag.name
1226 self.currentTag = self.tagStack[-1]
1227 return self.currentTag
1229 def pushTag(self, tag):
1230 #print "Push", tag.name
1232 self.currentTag.contents.append(tag)
1233 self.tagStack.append(tag)
1234 self.currentTag = self.tagStack[-1]
1236 def endData(self, containerClass=NavigableString):
1237 if self.currentData:
1238 currentData = u''.join(self.currentData)
1239 if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1240 not set([tag.name for tag in self.tagStack]).intersection(
1241 self.PRESERVE_WHITESPACE_TAGS)):
1242 if '\n' in currentData:
1246 self.currentData = []
1247 if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1248 (not self.parseOnlyThese.text or \
1249 not self.parseOnlyThese.search(currentData)):
1251 o = containerClass(currentData)
1252 o.setup(self.currentTag, self.previous)
1254 self.previous.next = o
1256 self.currentTag.contents.append(o)
1259 def _popToTag(self, name, inclusivePop=True):
1260 """Pops the tag stack up to and including the most recent
1261 instance of the given tag. If inclusivePop is false, pops the tag
1262 stack up to but *not* including the most recent instqance of
1264 #print "Popping to %s" % name
1265 if name == self.ROOT_TAG_NAME:
1269 mostRecentTag = None
1270 for i in range(len(self.tagStack)-1, 0, -1):
1271 if name == self.tagStack[i].name:
1272 numPops = len(self.tagStack)-i
1274 if not inclusivePop:
1275 numPops = numPops - 1
1277 for i in range(0, numPops):
1278 mostRecentTag = self.popTag()
1279 return mostRecentTag
1281 def _smartPop(self, name):
1283 """We need to pop up to the previous tag of this type, unless
1284 one of this tag's nesting reset triggers comes between this
1285 tag and the previous tag of this type, OR unless this tag is a
1286 generic nesting trigger and another generic nesting trigger
1287 comes between this tag and the previous tag of this type.
1290 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1294 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1296 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1299 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1300 isNestable = nestingResetTriggers != None
1301 isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1304 for i in range(len(self.tagStack)-1, 0, -1):
1305 p = self.tagStack[i]
1306 if (not p or p.name == name) and not isNestable:
1307 #Non-nestable tags get popped to the top or to their
1311 if (nestingResetTriggers is not None
1312 and p.name in nestingResetTriggers) \
1313 or (nestingResetTriggers is None and isResetNesting
1314 and self.RESET_NESTING_TAGS.has_key(p.name)):
1316 #If we encounter one of the nesting reset triggers
1317 #peculiar to this tag, or we encounter another tag
1318 #that causes nesting to reset, pop up to but not
1319 #including that tag.
1325 self._popToTag(popTo, inclusive)
1327 def unknown_starttag(self, name, attrs, selfClosing=0):
1328 #print "Start tag %s: %s" % (name, attrs)
1330 #This is not a real tag.
1331 #print "<%s> is not real!" % name
1332 attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
1333 self.handle_data('<%s%s>' % (name, attrs))
1337 if not self.isSelfClosingTag(name) and not selfClosing:
1338 self._smartPop(name)
1340 if self.parseOnlyThese and len(self.tagStack) <= 1 \
1341 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1344 tag = Tag(self, name, attrs, self.currentTag, self.previous)
1346 self.previous.next = tag
1349 if selfClosing or self.isSelfClosingTag(name):
1351 if name in self.QUOTE_TAGS:
1352 #print "Beginning quote (%s)" % name
1353 self.quoteStack.append(name)
1357 def unknown_endtag(self, name):
1358 #print "End tag %s" % name
1359 if self.quoteStack and self.quoteStack[-1] != name:
1360 #This is not a real end tag.
1361 #print "</%s> is not real!" % name
1362 self.handle_data('</%s>' % name)
1365 self._popToTag(name)
1366 if self.quoteStack and self.quoteStack[-1] == name:
1367 self.quoteStack.pop()
1368 self.literal = (len(self.quoteStack) > 0)
1370 def handle_data(self, data):
1371 self.currentData.append(data)
1373 def _toStringSubclass(self, text, subclass):
1374 """Adds a certain piece of text to the tree as a NavigableString
1377 self.handle_data(text)
1378 self.endData(subclass)
1380 def handle_pi(self, text):
1381 """Handle a processing instruction as a ProcessingInstruction
1382 object, possibly one with a %SOUP-ENCODING% slot into which an
1383 encoding will be plugged later."""
1384 if text[:3] == "xml":
1385 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1386 self._toStringSubclass(text, ProcessingInstruction)
1388 def handle_comment(self, text):
1389 "Handle comments as Comment objects."
1390 self._toStringSubclass(text, Comment)
1392 def handle_charref(self, ref):
1393 "Handle character references as data."
1394 if self.convertEntities:
1395 data = unichr(int(ref))
1397 data = '&#%s;' % ref
1398 self.handle_data(data)
1400 def handle_entityref(self, ref):
1401 """Handle entity references as data, possibly converting known
1402 HTML and/or XML entity references to the corresponding Unicode
1405 if self.convertHTMLEntities:
1407 data = unichr(name2codepoint[ref])
1411 if not data and self.convertXMLEntities:
1412 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1414 if not data and self.convertHTMLEntities and \
1415 not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416 # TODO: We've got a problem here. We're told this is
1417 # an entity reference, but it's not an XML entity
1418 # reference or an HTML entity reference. Nonetheless,
1419 # the logical thing to do is to pass it through as an
1420 # unrecognized entity reference.
1422 # Except: when the input is "&carol;" this function
1423 # will be called with input "carol". When the input is
1424 # "AT&T", this function will be called with input
1425 # "T". We have no way of knowing whether a semicolon
1426 # was present originally, so we don't know whether
1427 # this is an unknown entity or just a misplaced
1430 # The more common case is a misplaced ampersand, so I
1431 # escape the ampersand and omit the trailing semicolon.
1432 data = "&%s" % ref
1434 # This case is different from the one above, because we
1435 # haven't already gone through a supposedly comprehensive
1436 # mapping of entities to Unicode characters. We might not
1437 # have gone through any mapping at all. So the chances are
1438 # very high that this is a real entity, and not a
1439 # misplaced ampersand.
1441 self.handle_data(data)
1443 def handle_decl(self, data):
1444 "Handle DOCTYPEs and the like as Declaration objects."
1445 self._toStringSubclass(data, Declaration)
1447 def parse_declaration(self, i):
1448 """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449 declaration as a CData object."""
1451 if self.rawdata[i:i+9] == '<![CDATA[':
1452 k = self.rawdata.find(']]>', i)
1454 k = len(self.rawdata)
1455 data = self.rawdata[i+9:k]
1457 self._toStringSubclass(data, CData)
1460 j = SGMLParser.parse_declaration(self, i)
1461 except SGMLParseError:
1462 toHandle = self.rawdata[i:]
1463 self.handle_data(toHandle)
1464 j = i + len(toHandle)
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML documents get HTML smart-quote handling by default, and
        # are flagged as HTML so meta-tag encoding sniffing kicks in.
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised by start_meta to abort a parse that will be redone with
    better encoding information."""
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No tag is treated as nestable: bad markup gets no nesting help.
    NESTABLE_TAGS = buildTagMap([])
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # When a tag holding exactly one string child is closed,
        # mirror that string onto the parent as an attribute (unless
        # the parent already has an attribute of that name).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1703 #Enterprise class names! It has come to our attention that some people
1704 #think the names of the Beautiful Soup parser classes are too silly
1705 #and "unprofessional" for use in enterprise screen-scraping. We feel
1706 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1707 #All-Night Kosher Bakery recommends renaming this file to
1708 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1709 #"RobustParserBeanInterface.class") and using the following
1710 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass
1722 ######################################################
1724 # Bonus library: Unicode, Dammit
1726 # This class forces XML data into a standard format (usually to UTF-8
1727 # or Unicode). It is heavily based on code from Mark Pilgrim's
1728 # Universal Feed Parser. It does not rewrite the XML or HTML to
1729 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1730 # (XML) and BeautifulSoup.start_meta (HTML).
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    # chardet is optional; UnicodeDammit checks for None before use.
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
    import iconv_codec
except ImportError:
    # Extra codecs are a nice-to-have; proceed without them.
    pass
1753 class UnicodeDammit:
1754 """A class for detecting the encoding of a *ML document and
1755 converting it to a Unicode string. If the source encoding is
1756 windows-1252, can replace MS smart quotes with their HTML or XML
1759 # This dictionary maps commonly seen values for "charset" in HTML
1760 # meta tags to the corresponding Python codec names. It only covers
1761 # values that aren't in Python's aliases and can't be determined
1762 # by the heuristics in find_codec.
1763 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1764 "x-sjis" : "shift-jis" }
1766 def __init__(self, markup, overrideEncodings=[],
1767 smartQuotesTo='xml', isHTML=False):
1768 self.declaredHTMLEncoding = None
1769 self.markup, documentEncoding, sniffedEncoding = \
1770 self._detectEncoding(markup, isHTML)
1771 self.smartQuotesTo = smartQuotesTo
1772 self.triedEncodings = []
1773 if markup == '' or isinstance(markup, unicode):
1774 self.originalEncoding = None
1775 self.unicode = unicode(markup)
1779 for proposedEncoding in overrideEncodings:
1780 u = self._convertFrom(proposedEncoding)
1783 for proposedEncoding in (documentEncoding, sniffedEncoding):
1784 u = self._convertFrom(proposedEncoding)
1787 # If no luck and we have auto-detection library, try that:
1788 if not u and chardet and not isinstance(self.markup, unicode):
1789 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1791 # As a last resort, try utf-8 and windows-1252:
1793 for proposed_encoding in ("utf-8", "windows-1252"):
1794 u = self._convertFrom(proposed_encoding)
1798 if not u: self.originalEncoding = None
1800 def _subMSChar(self, orig):
1801 """Changes a MS smart quote character to an XML or HTML
1803 sub = self.MS_CHARS.get(orig)
1804 if isinstance(sub, tuple):
1805 if self.smartQuotesTo == 'xml':
1806 sub = '&#x%s;' % sub[1]
1808 sub = '&%s;' % sub[0]
1811 def _convertFrom(self, proposed):
1812 proposed = self.find_codec(proposed)
1813 if not proposed or proposed in self.triedEncodings:
1815 self.triedEncodings.append(proposed)
1816 markup = self.markup
1818 # Convert smart quotes to HTML if coming from an encoding
1819 # that might have them.
1820 if self.smartQuotesTo and proposed.lower() in("windows-1252",
1823 markup = re.compile("([\x80-\x9f])").sub \
1824 (lambda(x): self._subMSChar(x.group(1)),
1828 # print "Trying to convert document to %s" % proposed
1829 u = self._toUnicode(markup, proposed)
1831 self.originalEncoding = proposed
1832 except Exception, e:
1833 # print "That didn't work!"
1836 #print "Correct encoding: %s" % proposed
1839 def _toUnicode(self, data, encoding):
1840 '''Given a string and its encoding, decodes the string into Unicode.
1841 %encoding is a string recognized by encodings.aliases'''
1843 # strip Byte Order Mark (if present)
1844 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1845 and (data[2:4] != '\x00\x00'):
1846 encoding = 'utf-16be'
1848 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1849 and (data[2:4] != '\x00\x00'):
1850 encoding = 'utf-16le'
1852 elif data[:3] == '\xef\xbb\xbf':
1855 elif data[:4] == '\x00\x00\xfe\xff':
1856 encoding = 'utf-32be'
1858 elif data[:4] == '\xff\xfe\x00\x00':
1859 encoding = 'utf-32le'
1861 newdata = unicode(data, encoding)
1864 def _detectEncoding(self, xml_data, isHTML=False):
1865 """Given a document, tries to detect its XML encoding."""
1866 xml_encoding = sniffed_xml_encoding = None
1868 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1870 xml_data = self._ebcdic_to_ascii(xml_data)
1871 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1873 sniffed_xml_encoding = 'utf-16be'
1874 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1875 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1876 and (xml_data[2:4] != '\x00\x00'):
1878 sniffed_xml_encoding = 'utf-16be'
1879 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1880 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1882 sniffed_xml_encoding = 'utf-16le'
1883 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1884 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1885 (xml_data[2:4] != '\x00\x00'):
1887 sniffed_xml_encoding = 'utf-16le'
1888 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1889 elif xml_data[:4] == '\x00\x00\x00\x3c':
1891 sniffed_xml_encoding = 'utf-32be'
1892 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1893 elif xml_data[:4] == '\x3c\x00\x00\x00':
1895 sniffed_xml_encoding = 'utf-32le'
1896 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1897 elif xml_data[:4] == '\x00\x00\xfe\xff':
1899 sniffed_xml_encoding = 'utf-32be'
1900 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1901 elif xml_data[:4] == '\xff\xfe\x00\x00':
1903 sniffed_xml_encoding = 'utf-32le'
1904 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1905 elif xml_data[:3] == '\xef\xbb\xbf':
1907 sniffed_xml_encoding = 'utf-8'
1908 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1910 sniffed_xml_encoding = 'ascii'
1913 xml_encoding_match = None
1914 xml_encoding_match = re.compile(
1915 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1916 if not xml_encoding_match and isHTML:
1917 regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1918 xml_encoding_match = regexp.search(xml_data)
1919 if xml_encoding_match is not None:
1920 xml_encoding = xml_encoding_match.groups()[0].lower()
1922 self.declaredHTMLEncoding = xml_encoding
1923 if sniffed_xml_encoding and \
1924 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1925 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1926 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1928 xml_encoding = sniffed_xml_encoding
1929 return xml_data, xml_encoding, sniffed_xml_encoding
1932 def find_codec(self, charset):
1933 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1934 or (charset and self._codec(charset.replace("-", ""))) \
1935 or (charset and self._codec(charset.replace("-", "_"))) \
1938 def _codec(self, charset):
1939 if not charset: return charset
1942 codecs.lookup(charset)
1944 except (LookupError, ValueError):
1948 EBCDIC_TO_ASCII_MAP = None
1949 def _ebcdic_to_ascii(self, s):
1951 if not c.EBCDIC_TO_ASCII_MAP:
1952 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1953 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1954 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1955 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1956 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1957 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1958 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1959 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1960 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1961 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1962 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1963 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1964 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1965 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1966 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1967 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1968 250,251,252,253,254,255)
1970 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1971 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1972 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1974 MS_CHARS = { '\x80' : ('euro', '20AC'),
1976 '\x82' : ('sbquo', '201A'),
1977 '\x83' : ('fnof', '192'),
1978 '\x84' : ('bdquo', '201E'),
1979 '\x85' : ('hellip', '2026'),
1980 '\x86' : ('dagger', '2020'),
1981 '\x87' : ('Dagger', '2021'),
1982 '\x88' : ('circ', '2C6'),
1983 '\x89' : ('permil', '2030'),
1984 '\x8A' : ('Scaron', '160'),
1985 '\x8B' : ('lsaquo', '2039'),
1986 '\x8C' : ('OElig', '152'),
1988 '\x8E' : ('#x17D', '17D'),
1991 '\x91' : ('lsquo', '2018'),
1992 '\x92' : ('rsquo', '2019'),
1993 '\x93' : ('ldquo', '201C'),
1994 '\x94' : ('rdquo', '201D'),
1995 '\x95' : ('bull', '2022'),
1996 '\x96' : ('ndash', '2013'),
1997 '\x97' : ('mdash', '2014'),
1998 '\x98' : ('tilde', '2DC'),
1999 '\x99' : ('trade', '2122'),
2000 '\x9a' : ('scaron', '161'),
2001 '\x9b' : ('rsaquo', '203A'),
2002 '\x9c' : ('oelig', '153'),
2004 '\x9e' : ('#x17E', '17E'),
2005 '\x9f' : ('Yuml', ''),}
2007 #######################################################################
2010 #By default, act as an HTML pretty-printer.
2011 if __name__ == '__main__':
2013 soup = BeautifulSoup(sys.stdin)
2014 print soup.prettify()