parse_html/html5.py

"""HTML Parser using html5lib

This file is a part of Bookmarks database and Internet robot.
"""

# The "$Keyword$" strings look like version-control keyword placeholders;
# presumably each slice strips the "$Keyword: " prefix and the trailing " $"
# once the keyword has been expanded -- TODO confirm against the VCS setup.
# With the unexpanded placeholders shown here every slice yields ''.
__version__ = "$Revision$"[11:-2]
__revision__ = "$Id$"[5:-2]
__date__ = "$Date$"[7:-2]
__author__ = "Oleg Broytman <phd@phdru.name>"
__copyright__ = "Copyright (C) 2010, 2011 PhiloSoft Design"
__license__ = "GNU GPL"

# Public API of this module: only parse_html is exported by a star-import.
__all__ = ['parse_html']
  14
  15
  16 from html5lib import HTMLParser as HTML5Parser
  17 from .util import HTMLParser
  18
  19
  20 def parse_html(filename, charset=None, log=None):
  21     parser = HTML5Parser()
  22     fp = open(filename)
  23     parser._parse(fp, encoding=charset, parseMeta=bool(charset))
  24     fp.close()
  25     html_tree = parser.tree.getDocument()
  26
  27     for node in html_tree.childNodes:
  28         if (node.name == 'html') and (node.type != 3): # Skip DocType element
  29             html = node
  30             break
  31     else:
  32         html = None
  33
  34     if not html:
  35         return None
  36
  37     for node in html.childNodes:
  38         if node.name == 'head':
  39             head = node
  40             break
  41     else:
  42         head = None
  43
  44     meta_charset = False
  45     title = None
  46     refresh = None
  47     icon = None
  48
  49     if head:
  50         for node in head.childNodes:
  51             if node.name == 'title':
  52                 if node.childNodes:
  53                     title = node.childNodes[0].value
  54                     break
  55                 else:
  56                     title = ''
  57
  58         if title is None:
  59             return None
  60
  61         for node in head.childNodes:
  62             if node.name == 'meta' and \
  63                     ('http-equiv' in node.attributes) and \
  64                     (node.attributes['http-equiv'] == 'content-type'):
  65                 meta_content = node.attributes['content']
  66                 if meta_content:
  67                     try:
  68                         meta_charset = \
  69                             meta_content.lower().split('charset=')[1].split(';')[0]
  70                     except IndexError:
  71                         meta_charset = False
  72                     else:
  73                         break
  74
  75         if not charset:
  76             charset = parser.tokenizer.stream.charEncoding[0]
  77
  78         if charset or meta_charset:
  79             title = title.encode(charset or meta_charset)
  80
  81         for node in head.childNodes:
  82             if node.name == 'meta' and \
  83                     ('http-equiv' in node.attributes) and \
  84                     (node.attributes['http-equiv'] == 'refresh'):
  85                 refresh = node.attributes['content']
  86                 break
  87
  88         for node in head.childNodes:
  89             if node.name == 'link' and \
  90                     ('rel' in node.attributes) and \
  91                     (node.attributes['rel'] in ('icon', 'shortcut icon')):
  92                 icon = node.attributes['href']
  93                 break
  94
  95     else:
  96         for node in html.childNodes:
  97             if node.name == 'title':
  98                 if node.childNodes:
  99                     title = node.childNodes[0].value
 100                     break
 101                 else:
 102                     title = ''
 103
 104         if title is None:
 105             return None
 106
 107     return HTMLParser(charset, meta_charset, title, refresh, icon)