Robots/parse_html.py

   1 #! /usr/bin/env python
   2 """
   3    HTML Parsers wrapper
   4
   5    Written by BroytMann. Copyright (C) 1997-2007 PhiloSoft Design
   6 """
   7
   8 import codecs
   9
  10 from m_lib.defenc import default_encoding
# Charset that titles are recoded into by parse_html(): m_lib's
# default_encoding with any "windows-" prefix rewritten to "cp"
# (e.g. "windows-1251" becomes "cp1251").
current_charset = default_encoding.replace("windows-", "cp")
# Charset assumed by parse_html() when the parser reports no charset and
# the title is not plain ASCII.
DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
  13
  14 from parse_html_htmlparser import parse_html as _parse_html
  15
  16
  17 class HTMLParser(object):
  18    def __init__(self, charset=None):
  19       _HTMLParser.__init__(self)
  20       self.charset = charset
  21       self.meta_charset = 0
  22       self.title = ''
  23       self.refresh = ''
  24       self.icon = None
  25
  26
  27 import re
  28 entity_re = re.compile("(&#[0-9]+;)")
  29
  30 def recode_entities(title, charset):
  31    output = []
  32    for part in entity_re.split(title):
  33       if entity_re.match(part):
  34          part = unichr(int(part[2:-1])).encode(charset, "replace")
  35       output.append(part)
  36
  37    return ''.join(output)
  38
  39
  40 def parse_html(filename, charset=None, log=None):
  41    if charset:
  42       try:
  43          codecs.lookup(charset) # In case of unknown charset...
  44       except (ValueError, LookupError):
  45          charset = None         # ...try charset from HTML
  46
  47    parser = _parse_html(filename, charset)
  48    title = parser.title
  49
  50    if not parser.charset:
  51       try:
  52          unicode(title, "ascii")
  53       except UnicodeDecodeError:
  54          parser.charset = DEFAULT_CHARSET
  55
  56    if parser.charset:
  57       parser.charset = parser.charset.replace("windows-", "cp").lower()
  58
  59    if parser.charset and (parser.charset <> current_charset):
  60       try:
  61          if parser.meta_charset:
  62             if log: log("   META charset   : %s" % parser.charset)
  63          else:
  64             if log: log("   charset        : %s" % parser.charset)
  65          if log: log("   title          : %s" % title)
  66          title = unicode(title, parser.charset, "replace").encode(current_charset, "replace")
  67          if log: log("   current charset: %s" % current_charset)
  68          if log: log("   converted title: %s" % title)
  69       except LookupError:
  70          if log: log("   unknown charset: `%s' or `%s'" % (parser.charset, current_charset))
  71
  72    parser.title = recode_entities(title, current_charset)
  73    return parser
  74
  75
  76 if __name__ == '__main__':
  77    import sys
  78    parser = parse_html(sys.argv[1])
  79    print parser.charset
  80    print parser.title
  81    print parser.refresh
  82    print parser.icon