Fix(parse_html): Fix BS4 parser: encode title to utf-8 as the last resort

author Oleg Broytman <phd@phdru.name>

Sun, 13 Aug 2017 17:40:22 +0000 (20:40 +0300)

committer Oleg Broytman <phd@phdru.name>

Sun, 13 Aug 2017 17:40:22 +0000 (20:40 +0300)
author Oleg Broytman <phd@phdru.name>
Sun, 13 Aug 2017 17:40:22 +0000 (20:40 +0300)
committer Oleg Broytman <phd@phdru.name>
Sun, 13 Aug 2017 17:40:22 +0000 (20:40 +0300)
diff --git a/parse_html/bkmk_ph_beautifulsoup4.py b/parse_html/bkmk_ph_beautifulsoup4.py

index aad3f8fb9092bdba2daec856a93f4309f9714795..10e06a97cd4fc4e25689cd3a92040878a5f2775a 100644 (file)
--- a/parse_html/bkmk_ph_beautifulsoup4.py
+++ b/parse_html/bkmk_ph_beautifulsoup4.py
@@ -15,6 +15,7 @@ import re
  from bs4 import BeautifulSoup
  from .bkmk_ph_util import HTMLParser
  
+universal_charset = "utf-8"
  DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
  
  def _parse_html(html_text, charset):
@@ -80,7 +81,11 @@ def parse_html(html_text, charset=None, log=None):
                  meta_charset = _charset = meta_content.lower()
  
      if title and (_charset or meta_charset):
-        title = title.encode(_charset or meta_charset)
+        try:
+            title = title.encode(_charset or meta_charset)
+        except LookupError:
+            title = title.encode(universal_charset)
+            _charset = universal_charset
  
      meta = head.find(_find_refresh, recursive=False)
      if meta:
author	Oleg Broytman <phd@phdru.name>
	Sun, 13 Aug 2017 17:40:22 +0000 (20:40 +0300)
committer	Oleg Broytman <phd@phdru.name>
	Sun, 13 Aug 2017 17:40:22 +0000 (20:40 +0300)