current_charset = default_encoding.replace("windows-", "cp")
DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
-from parse_html_htmlparser import parse_html as _parse_html
-
-
-class HTMLParser(object):
- def __init__(self, charset=None):
- _HTMLParser.__init__(self)
- self.charset = charset
- self.meta_charset = 0
- self.title = ''
- self.refresh = ''
- self.icon = None
+try:
+ from parse_html_beautifulsoup import parse_html as _parse_html
+except ImportError:
+ from parse_html_htmlparser import parse_html as _parse_html
import re
if __name__ == '__main__':
import sys
- parser = parse_html(sys.argv[1])
+ parser = parse_html(sys.argv[1], current_charset)
print parser.charset
print parser.title
print parser.refresh
--- /dev/null
+"""
+ HTML Parser using BeautifulSoup
+
+ Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
+"""
+
+from BeautifulSoup import BeautifulSoup
+
+
+class DummyParser(object):
+    """Plain value holder for the results collected by parse_html.
+
+    Instances expose the attributes charset, meta_charset, title,
+    refresh and icon; the class performs no parsing of its own.
+    """
+
+    def __init__(self, charset, meta, title, refresh, icon):
+        """Store the given values; ``meta`` is kept as ``meta_charset``."""
+        object.__init__(self)
+        # Encoding information.
+        self.charset, self.meta_charset = charset, meta
+        # Page metadata.
+        self.title, self.refresh, self.icon = title, refresh, icon
+
+def parse_html(filename, charset=None):
+    """Parse an HTML file with BeautifulSoup and collect page metadata.
+
+    filename -- path of the HTML file to read.
+    charset  -- optional encoding, passed to BeautifulSoup as fromEncoding.
+
+    Returns a DummyParser with:
+      charset      -- the encoding BeautifulSoup reports in originalEncoding;
+      meta_charset -- always False;
+      title        -- the <title> text encoded into that charset, or ''
+                      when the title cannot be reached;
+      refresh      -- "content" of the refresh <meta> looked up under
+                      <head> with recursive=False, or None;
+      icon         -- "href" of the icon <link> looked up the same way,
+                      or None.
+    """
+    infile = open(filename, 'r')
+    try:
+        root = BeautifulSoup(infile, fromEncoding=charset)
+    finally:
+        # Release the file handle even when the parser raises.
+        infile.close()
+
+    detected_charset = root.originalEncoding
+    try:
+        title = root.html.head.title.string.encode(detected_charset)
+    except AttributeError:
+        # Some link of the html/head/title/string chain is missing.
+        title = ''
+
+    refresh = _head_child_attr(root, _find_refresh, "content")
+    icon = _head_child_attr(root, _find_icon, "href")
+
+    return DummyParser(detected_charset, False, title, refresh, icon)
+
+def _head_child_attr(root, matcher, attr_name):
+    """Return attr_name of the <head> tag selected by matcher, or None.
+
+    The search is done with recursive=False.  None is returned when the
+    html/head chain is missing or when nothing matches.
+    """
+    try:
+        tag = root.html.head.find(matcher, recursive=False)
+    except AttributeError:
+        return None
+    if not tag:
+        return None
+    return tag.get(attr_name)
+
+def _find_refresh(Tag):
+    """Tell whether Tag is a <meta> whose http-equiv is "refresh".
+
+    The http-equiv value is compared case-insensitively; a missing
+    attribute counts as an empty string.
+    """
+    if Tag.name != "meta":
+        return False
+    http_equiv = Tag.get("http-equiv", '')
+    return http_equiv.lower() == "refresh"
+
+def _find_icon(Tag):
+    """Tell whether Tag is a <link> whose rel is an icon relation.
+
+    The rel value is lower-cased and must be exactly 'icon' or
+    'shortcut icon'; a missing attribute counts as an empty string.
+    """
+    if Tag.name != "link":
+        return False
+    relation = Tag.get("rel", '').lower()
+    return relation in ('icon', 'shortcut icon')