parse_html/beautifulsoup.py

   1 """HTML Parser using BeautifulSoup
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4 """
   5
   6 __version__ = "$Revision$"[11:-2]
   7 __revision__ = "$Id$"[5:-2]
   8 __date__ = "$Date$"[7:-2]
   9 __author__ = "Oleg Broytman <phd@phdru.name>"
  10 __copyright__ = "Copyright (C) 2007-2011 PhiloSoft Design"
  11 __license__ = "GNU GPL"
  12
  13 import re
  14 from sgmllib import SGMLParser, SGMLParseError
  15 from BeautifulSoup import BeautifulSoup, CData
  16 from .util import HTMLParser
  17
  18
  19 # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
  20 class BadDeclParser(BeautifulSoup):
  21     def parse_declaration(self, i):
  22          """Treat a bogus SGML declaration as raw data. Treat a CDATA
  23          declaration as a CData object."""
  24          j = None
  25          if self.rawdata[i:i+9] == '<![CDATA[':
  26               k = self.rawdata.find(']]>', i)
  27               if k == -1:
  28                   k = len(self.rawdata)
  29               data = self.rawdata[i+9:k]
  30               j = k+3
  31               self._toStringSubclass(data, CData)
  32          else:
  33              try:
  34                  j = SGMLParser.parse_declaration(self, i)
  35              except SGMLParseError:
  36                  # Could not parse the DOCTYPE declaration
  37                  # Try to just skip the actual declaration
  38                  match = re.search(r'<!DOCTYPE([^>]*?)>', self.rawdata[i:], re.MULTILINE|re.IGNORECASE)
  39                  if match:
  40                      toHandle = self.rawdata[i:match.end()]
  41                  else:
  42                      toHandle = self.rawdata[i:]
  43                  self.handle_data(toHandle)
  44                  j = i + len(toHandle)
  45          return j
  46
  47
  48 def _parse_html(filename, charset):
  49    infile = open(filename, 'r')
  50    try:
  51       return BadDeclParser(infile, fromEncoding=charset)
  52    except TypeError:
  53       return None
  54    finally:
  55       infile.close()
  56
  57 def parse_html(filename, charset=None, log=None):
  58    root = _parse_html(filename, charset)
  59    if root is None:
  60       return None
  61
  62    _charset = root.originalEncoding
  63    if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
  64       _charset = DEFAULT_CHARSET
  65       root = _parse_html(filename, _charset)
  66       if root is None:
  67          return None
  68
  69    html = root.html
  70    if html is None:
  71       html = root
  72
  73    head = html.head
  74    if head is None:
  75       head = html # Some sites put TITLE in HTML without HEAD
  76
  77    title = head.title
  78    if (title is None) and (html is not head):
  79       # Some sites put TITLE in HTML outside of HEAD
  80       title = html.title
  81
  82    if title is None:
  83       # Lookup TITLE in the root
  84       title = root.title
  85
  86    if title is None:
  87       return None
  88
  89    if title.string:
  90       title = title.string
  91    else:
  92       parts = []
  93       for part in title:
  94          if not isinstance(part, basestring):
  95             part = unicode(part)
  96          parts.append(part.strip())
  97       title = ''.join(parts)
  98
  99    meta = head.find(_find_contenttype, recursive=False)
 100    if meta:
 101       try:
 102          meta_content = meta.get("content")
 103          if meta_content:
 104              __charset = meta_content.lower().split('charset=')[1].split(';')[0]
 105          else:
 106              __charset = False
 107       except IndexError: # No charset in the META Content-Type
 108          meta_charset = False
 109       else:
 110          meta_charset = _charset == __charset
 111    else:
 112       meta_charset = False
 113
 114    if _charset or meta_charset:
 115       title = title.encode(_charset or meta_charset)
 116
 117    meta = head.find(_find_refresh, recursive=False)
 118    if meta:
 119       refresh = meta.get("content")
 120    else:
 121       refresh = None
 122
 123    meta = head.find(_find_icon, recursive=False)
 124    if meta:
 125       icon = meta.get("href")
 126    else:
 127       icon = None
 128
 129    return HTMLParser(_charset, meta_charset, title, refresh, icon)
 130
 131 def _find_contenttype(Tag):
 132    return (Tag.name == "meta") and \
 133       (Tag.get("http-equiv", '').lower() == "content-type")
 134
 135 def _find_refresh(Tag):
 136    return (Tag.name == "meta") and \
 137       (Tag.get("http-equiv", '').lower() == "refresh")
 138
 139 def _find_icon(Tag):
 140    return (Tag.name == "link") and \
 141       (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))