-
-import sys
-current_charset = sys.getdefaultencoding()
-DEFAULT_CHARSET = "windows-1251"
-
-
-from HTMLParser import HTMLParseError
-from m_lib.www.html import HTMLParser as _HTMLParser
-
-
-class HTMLHeadDone(Exception): pass
-
-
-class HTMLParser(_HTMLParser):
- def __init__(self, charset=None):
- _HTMLParser.__init__(self)
- self.charset = charset
- self.meta_charset = 0
- self.title = ''
- self.refresh = ''
-
- def end_head(self):
- raise HTMLHeadDone()
-
-
- def do_meta(self, attrs):
- http_equiv = ""
- content = ""
-
- for attrname, value in attrs:
- if value:
- value = value.strip()
- if attrname == 'http-equiv':
- http_equiv = value.lower()
- elif attrname == 'content':
- content = value
-
- if (not self.charset) and (http_equiv == "content-type"):
+import codecs
+
+universal_charset = "utf-8"  # NOTE(review): not referenced within this hunk - confirm its role against the rest of the file
+DEFAULT_CHARSET = "cp1251" # Default charset: cp1251 (Windows-1251), a legacy encoding for Russian Cyrillic
+
+parsers = []  # parse_html callables of the backends that imported successfully, in the order tried below
+
+try:  # each backend is optional: an ImportError just leaves it out of `parsers`
+ import parse_html_beautifulsoup
+ parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET  # push this module's default charset into the backend
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html_beautifulsoup.parse_html)
+
+try:
+ from parse_html_lxml import parse_html  # the name parse_html is rebound by the next backend import
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html)  # the list keeps this function object even after the name is rebound
+
+try:
+ from parse_html_htmlparser import parse_html
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html)
+
+try:
+ import parse_html_html5  # imported as a module, so its parse_html does not rebind the name above
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html_html5.parse_html)
+
+
+import re
+from htmlentitydefs import name2codepoint  # Python 2 module name (html.entities in Python 3)
+
+entity_re = re.compile("(&\w+;)")  # named entities such as &nbsp;; the capturing group makes split() keep the matched entities
+num_entity_re = re.compile("(&#[0-9]+;)")  # decimal numeric entities such as &#1234;; hexadecimal &#x...; forms are not matched
+
+def recode_entities(title, charset):
+ output = []
+ for part in entity_re.split(title):
+ if part not in ("&", "<", ">", """) and \
+ entity_re.match(part):
+ _part = name2codepoint.get(part[1:-1], None)
+ if _part is not None:
+ part = unichr(_part).encode(charset)
+ output.append(part)
+ title = ''.join(output)
+
+ output = []
+ for part in num_entity_re.split(title):
+ if num_entity_re.match(part):