Robots/parse_html_html5.py

   1 """
   2     HTML Parser using html5lib.
   3
   4     Written by Broytman. Copyright (C) 2010 PhiloSoft Design
   5 """
   6
   7 from html5lib import HTMLParser as HTML5Parser
   8 from parse_html_util import HTMLParser
   9
  10
  11 def parse_html(filename, charset=None, log=None):
  12     fp = open(filename)
  13     html_tree = HTML5Parser().parse(fp, charset)
  14     fp.close()
  15
  16     html = html_tree.childNodes[-1]
  17     for node in html.childNodes:
  18         if node.name == 'head':
  19             head = node
  20             break
  21     else:
  22         head = None
  23
  24     meta_charset = False
  25     title = None
  26     refresh = None
  27     icon = None
  28
  29     if head:
  30         for node in head.childNodes:
  31             if node.name == 'meta' and \
  32                     ('http-equiv' in node.attributes) and \
  33                     (node.attributes['http-equiv'] == 'content-type'):
  34                 meta_content = node.attributes['content']
  35                 if meta_content:
  36                     try:
  37                         meta_charset = \
  38                             meta_content.lower().split('charset=')[1].split(';')[0]
  39                     except IndexError:
  40                         meta_charset = False
  41                     else:
  42                         break
  43
  44         for node in head.childNodes:
  45             if node.name == 'title':
  46                 title = node.childNodes[0].value
  47                 break
  48
  49         if title and (charset or meta_charset):
  50             title = title.encode(charset or meta_charset)
  51
  52         for node in head.childNodes:
  53             if node.name == 'meta' and \
  54                     ('http-equiv' in node.attributes) and \
  55                     (node.attributes['http-equiv'] == 'refresh'):
  56                 refresh = node.attributes['content']
  57                 break
  58
  59         for node in head.childNodes:
  60             if node.name == 'link' and \
  61                     ('rel' in node.attributes) and \
  62                     (node.attributes['rel'] in ('icon', 'shortcut icon')):
  63                 icon = node.attributes['href']
  64                 break
  65
  66     return HTMLParser(charset, meta_charset, title, refresh, icon)