Robots/parse_html_beautifulsoup.py

   1 """
   2    HTML Parser using BeautifulSoup
   3
   4    Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
   5 """
   6
   7 from BeautifulSoup import BeautifulSoup
   8
   9
  10 class DummyParser(object):
  11    def __init__(self, charset, meta, title, refresh, icon):
  12       object.__init__(self)
  13       self.charset = charset
  14       self.meta_charset = meta
  15       self.title = title
  16       self.refresh = refresh
  17       self.icon = icon
  18
  19 def parse_html(filename, charset=None):
  20    infile = open(filename, 'r')
  21    root = BeautifulSoup(infile, fromEncoding=charset)
  22    infile.close()
  23
  24    charset = root.originalEncoding
  25    try:
  26       title = root.html.head.title.string.encode(charset)
  27    except AttributeError:
  28       title = ''
  29
  30    try:
  31       meta = root.html.head.find(_find_refresh, recursive=False)
  32    except AttributeError:
  33       refresh = None
  34    else:
  35       if meta:
  36          refresh = meta.get("content")
  37       else:
  38          refresh = None
  39
  40    try:
  41       meta = root.html.head.find(_find_icon, recursive=False)
  42    except AttributeError:
  43       icon = None
  44    else:
  45       if meta:
  46          icon = meta.get("href")
  47       else:
  48          icon = None
  49
  50    parser = DummyParser(charset, False, title, refresh, icon)
  51    return parser
  52
  53 def _find_refresh(Tag):
  54    return (Tag.name == "meta") and \
  55       (Tag.get("http-equiv", '').lower() == "refresh")
  56
  57 def _find_icon(Tag):
  58    return (Tag.name == "link") and \
  59       (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))