From be7cec61fa405f38dea2edde623174ca47ca7dc3 Mon Sep 17 00:00:00 2001
From: Oleg Broytman <phd@phdru.name>
Date: Sun, 13 Aug 2017 20:40:22 +0300
Subject: [PATCH] Fix(parse_html): Fix BS4 parser: encode title to utf-8 as the
 last resort

---
 parse_html/bkmk_ph_beautifulsoup4.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/parse_html/bkmk_ph_beautifulsoup4.py b/parse_html/bkmk_ph_beautifulsoup4.py
index aad3f8f..10e06a9 100644
--- a/parse_html/bkmk_ph_beautifulsoup4.py
+++ b/parse_html/bkmk_ph_beautifulsoup4.py
@@ -15,6 +15,7 @@ import re
 from bs4 import BeautifulSoup
 from .bkmk_ph_util import HTMLParser
 
+universal_charset = "utf-8"  # Fallback when the declared charset is unknown
 DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
 
 def _parse_html(html_text, charset):
@@ -80,7 +81,11 @@ def parse_html(html_text, charset=None, log=None):
                 meta_charset = _charset = meta_content.lower()
 
     if title and (_charset or meta_charset):
-        title = title.encode(_charset or meta_charset)
+        try:  # NOTE(review): UnicodeEncodeError is not caught here
+            title = title.encode(_charset or meta_charset)
+        except LookupError:  # charset name is not a codec Python knows
+            title = title.encode(universal_charset)
+            _charset = universal_charset  # record the charset actually used
 
     meta = head.find(_find_refresh, recursive=False)
     if meta:
-- 
2.39.2