Robots/parse_html_beautifulsoup.py

   1 """
   2    HTML Parser using BeautifulSoup
   3
   4    Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
   5 """
   6
   7 from HTMLParser import HTMLParser
   8 from BeautifulSoup import BeautifulSoup
   9
  10
  11 class BSoupParser(HTMLParser):
  12    def __init__(self, charset, meta, title, refresh, icon):
  13       object.__init__(self)
  14       self.charset = charset
  15       self.meta_charset = meta
  16       self.title = title
  17       self.refresh = refresh
  18       self.icon = icon
  19
  20
  21 def parse_html(filename, charset=None):
  22    infile = open(filename, 'r')
  23    root = BeautifulSoup(infile, fromEncoding=charset)
  24    infile.close()
  25
  26    charset = root.originalEncoding
  27    try:
  28       title = root.html.head.title.string.encode(charset)
  29    except AttributeError:
  30       title = ''
  31
  32    try:
  33       meta = root.html.head.find(_find_refresh, recursive=False)
  34    except AttributeError:
  35       refresh = None
  36    else:
  37       if meta:
  38          refresh = meta.get("content")
  39       else:
  40          refresh = None
  41
  42    try:
  43       meta = root.html.head.find(_find_icon, recursive=False)
  44    except AttributeError:
  45       icon = None
  46    else:
  47       if meta:
  48          icon = meta.get("href")
  49       else:
  50          icon = None
  51
  52    parser = BSoupParser(charset, False, title, refresh, icon)
  53    return parser
  54
  55 def _find_refresh(Tag):
  56    return (Tag.name == "meta") and \
  57       (Tag.get("http-equiv", '').lower() == "refresh")
  58
  59 def _find_icon(Tag):
  60    return (Tag.name == "link") and \
  61       (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))