Robots/parse_html_beautifulsoup.py

   1 """
   2    HTML Parser using BeautifulSoup
   3
   4    Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
   5 """
   6
   7 from HTMLParser import HTMLParser
   8 from BeautifulSoup import BeautifulSoup
   9
  10
  11 class BSoupParser(HTMLParser):
  12    def __init__(self, charset, meta, title, refresh, icon):
  13       object.__init__(self)
  14       self.charset = charset
  15       self.meta_charset = meta
  16       self.title = title
  17       self.refresh = refresh
  18       self.icon = icon
  19
  20
  21 def parse_html(filename, charset=None):
  22    infile = open(filename, 'r')
  23    try:
  24       root = BeautifulSoup(infile, fromEncoding=charset)
  25    except TypeError:
  26       return None
  27    infile.close()
  28
  29    _charset = root.originalEncoding
  30    try:
  31       title = root.html.head.title.string.encode(_charset)
  32    except AttributeError:
  33       return None
  34
  35    meta = root.html.head.find(_find_refresh, recursive=False)
  36    if meta:
  37       refresh = meta.get("content")
  38    else:
  39       refresh = None
  40
  41    meta = root.html.head.find(_find_icon, recursive=False)
  42    if meta:
  43       icon = meta.get("href")
  44    else:
  45       icon = None
  46
  47    return BSoupParser(_charset, _charset != charset, title, refresh, icon)
  48
  49 def _find_refresh(Tag):
  50    return (Tag.name == "meta") and \
  51       (Tag.get("http-equiv", '').lower() == "refresh")
  52
  53 def _find_icon(Tag):
  54    return (Tag.name == "link") and \
  55       (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))