X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2Fbkmk_ph_beautifulsoup.py;h=f2f042e33b18aae1303f5fdbfa5f365f7aeeca78;hb=cb9c36b39ed72cd1fa272130d2bcf162a89c3013;hp=e1969f3250aa73d6e46e24883dd264f6df7ccd94;hpb=b9c8d112b8d8d0f7c726ee7dd07a89b6569c90a1;p=bookmarks_db.git diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py index e1969f3..f2f042e 100644 --- a/parse_html/bkmk_ph_beautifulsoup.py +++ b/parse_html/bkmk_ph_beautifulsoup.py @@ -65,7 +65,8 @@ def parse_html(html_text, charset=None, log=None): return None _charset = root.originalEncoding - if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default + if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): + # Replace with default and re-parse _charset = DEFAULT_CHARSET root = _parse_html(html_text, _charset) if root is None: @@ -104,7 +105,8 @@ def parse_html(html_text, charset=None, log=None): try: meta_content = meta.get("content") if meta_content: - __charset = meta_content.lower().split('charset=')[1].split(';')[0] + __charset = meta_content.lower().split('charset=')[1].\ + split(';')[0] else: __charset = False except IndexError: # No charset in the META Content-Type