# Charset pushed into backends that expose a DEFAULT_CHARSET setting
# (legacy default chosen for Russian Cyrillic pages).
DEFAULT_CHARSET = "cp1251"

# parse_html entry points of every backend that imported successfully,
# in registration order.  Each backend is optional: a failed import is
# skipped silently so the module still loads with whatever is installed.
parsers = []

try:
    from . import bkmk_ph_beautifulsoup
except ImportError:
    pass
else:
    # Register only after a successful import; referencing the module in
    # the ImportError branch would raise NameError.
    # NOTE(review): the BeautifulSoup 4 backend below also receives
    # DEFAULT_CHARSET -- confirm whether this backend needs it as well.
    parsers.append(bkmk_ph_beautifulsoup.parse_html)

try:
    from . import bkmk_ph_beautifulsoup4
except ImportError:
    pass
else:
    bkmk_ph_beautifulsoup4.DEFAULT_CHARSET = DEFAULT_CHARSET
    parsers.append(bkmk_ph_beautifulsoup4.parse_html)

try:
    from . import bkmk_ph_htmlparser
except ImportError:
    pass
else:
    parsers.append(bkmk_ph_htmlparser.parse_html)

try:
    from . import bkmk_ph_lxml
except ImportError:
    pass
else:
    parsers.append(bkmk_ph_lxml.parse_html)

universal_charset = "utf-8"

# Matches "&", one or more word characters, then ";" (e.g. "&amp;").
# The capturing group makes re.split() keep the matched text in its result.
entity_re = re.compile(r"(&\w+;)")
def parse_filename(filename, charset=None, log=None):
- fp = open(filename, 'rt')
+ fp = open(filename, 'rt', encoding=charset)
try:
parser = parse_html(fp.read(), charset=charset, log=log)
finally: