From 7fc9a9ac1bfa749aa30e3ae1d730ac4f266db950 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Sun, 13 Aug 2017 20:41:17 +0300 Subject: [PATCH] Fix(parse_html): encode title to utf-8 as the last resort --- parse_html/bkmk_parse_html.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index fa72e1f..af9395b 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -152,8 +152,15 @@ def parse_html(html_text, charset=None, log=None): try: parser.title = title.encode('ascii') except UnicodeEncodeError: - parser.charset = DEFAULT_CHARSET - parser.title = title.encode(DEFAULT_CHARSET) + try: + parser.title = title.encode(DEFAULT_CHARSET) + except UnicodeEncodeError: + parser.title = title.encode(universal_charset) + parser.charset = universal_charset + else: + parser.charset = DEFAULT_CHARSET + else: + parser.charset = 'ascii' converted_title = title = parser.title if title and (not parser.charset): -- 2.39.2