Robots/parse_html_html5.py

   1 """
   2     HTML Parser using html5.
   3
   4     Written by Broytman. Copyright (C) 2010 PhiloSoft Design
   5 """
   6
   7 from html5lib import HTMLParser as HTML5Parser
   8 from parse_html_util import HTMLParser
   9
  10
  11 def parse_html(filename, charset=None, log=None):
  12     fp = open(filename)
  13     html_tree = HTML5Parser().parse(fp, charset)
  14     fp.close()
  15
  16     if not html_tree.childNodes:
  17         return None
  18
  19     html = html_tree.childNodes[-1]
  20     for node in html.childNodes:
  21         if node.name == 'head':
  22             head = node
  23             break
  24     else:
  25         head = None
  26
  27     meta_charset = False
  28     title = None
  29     refresh = None
  30     icon = None
  31
  32     if head:
  33         for node in head.childNodes:
  34             if node.name == 'meta' and \
  35                     ('http-equiv' in node.attributes) and \
  36                     (node.attributes['http-equiv'] == 'content-type'):
  37                 meta_content = node.attributes['content']
  38                 if meta_content:
  39                     try:
  40                         meta_charset = \
  41                             meta_content.lower().split('charset=')[1].split(';')[0]
  42                     except IndexError:
  43                         meta_charset = False
  44                     else:
  45                         break
  46
  47         for node in head.childNodes:
  48             if node.name == 'title':
  49                 if node.childNodes:
  50                     title = node.childNodes[0].value
  51                     break
  52                 else:
  53                     title = ''
  54
  55         if title and (charset or meta_charset):
  56             title = title.encode(charset or meta_charset)
  57
  58         for node in head.childNodes:
  59             if node.name == 'meta' and \
  60                     ('http-equiv' in node.attributes) and \
  61                     (node.attributes['http-equiv'] == 'refresh'):
  62                 refresh = node.attributes['content']
  63                 break
  64
  65         for node in head.childNodes:
  66             if node.name == 'link' and \
  67                     ('rel' in node.attributes) and \
  68                     (node.attributes['rel'] in ('icon', 'shortcut icon')):
  69                 icon = node.attributes['href']
  70                 break
  71
  72     return HTMLParser(charset, meta_charset, title, refresh, icon)