From 8f6a9a521d21c398e100c254eda47d55498cbb54 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Wed, 15 Nov 2023 19:58:36 +0300 Subject: [PATCH] Fix(Py3): Fix HTML parsers --- parse_html/bkmk_parse_html.py | 2 +- parse_html/bkmk_ph_beautifulsoup4.py | 13 ++++++++++++- parse_html/bkmk_ph_html5.py | 7 +++++-- parse_html/bkmk_ph_htmlparser.py | 5 ++++- parse_html/bkmk_ph_lxml.py | 27 ++++++++++++++++++++++----- parse_html/bkmk_ph_util.py | 4 ++-- 6 files changed, 46 insertions(+), 12 deletions(-) diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 85aeb88..be5daab 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -168,7 +168,7 @@ def parse_html(html_text, charset=None, log=None): # parser.charset = 'ascii' converted_title = title = parser.title - if title and (not parser.charset): + if title and isinstance(title, bytes) and (not parser.charset): try: title.decode("ascii") except UnicodeDecodeError: diff --git a/parse_html/bkmk_ph_beautifulsoup4.py b/parse_html/bkmk_ph_beautifulsoup4.py index 1095ebc..6549683 100644 --- a/parse_html/bkmk_ph_beautifulsoup4.py +++ b/parse_html/bkmk_ph_beautifulsoup4.py @@ -11,18 +11,29 @@ __license__ = "GNU GPL" __all__ = ['parse_html'] +import warnings + from bs4 import BeautifulSoup from .bkmk_ph_util import HTMLParser from compat import string_type +warnings.filterwarnings( + 'ignore', 'No parser was explicitly specified') +warnings.filterwarnings( + 'ignore', + "It looks like you're parsing an XML document using an HTML parser.") + universal_charset = "utf-8" DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic def _parse_html(html_text, charset): try: - return BeautifulSoup(html_text, from_encoding=charset) + if isinstance(html_text, bytes): + return BeautifulSoup(html_text, from_encoding=charset) + else: + return BeautifulSoup(html_text) except TypeError: return None diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py index 68c1aba..1fabd82 100644 --- a/parse_html/bkmk_ph_html5.py +++ b/parse_html/bkmk_ph_html5.py @@ -17,8 +17,11 @@ from .bkmk_ph_util import HTMLParser def parse_html(html_text, charset=None, log=None): parser = HTML5Parser() - html_tree = parser.parse( - html_text, encoding=charset, parseMeta=bool(charset)) + if isinstance(html_text, bytes): + html_tree = parser.parse( + html_text, encoding=charset, parseMeta=bool(charset)) + else: + html_tree = parser.parse(html_text) html = None if hasattr(html_tree, 'childNodes'): diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index b90618f..fd7b687 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -11,7 +11,10 @@ __license__ = "GNU GPL" __all__ = ['parse_html'] -from HTMLParser import HTMLParseError +try: + from HTMLParser import HTMLParseError +except ImportError: + class HTMLParseError(Exception): pass from m_lib.net.www.html import HTMLParser as _HTMLParser diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index 03dd6f4..a02de91 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -11,15 +11,32 @@ __license__ = "GNU GPL" __all__ = ['parse_html'] -from lxml.html import fromtring +import re +from lxml.html import fromstring from .bkmk_ph_util import HTMLParser def parse_html(html_text, charset=None, log=None): - html_tree = fromtring(html_text) - - if html_tree.getroot() is None: - return None + try: + html_tree = fromstring(html_text) + except ValueError as e: + if e.args[0].startswith( + 'Unicode strings with encoding declaration are not supported.' + ' Please use bytes input' + ): + if not charset: + match = re.search( + '<\\?xml version="(\\d|.)+" encoding="([^"]+)"\\?>', + html_text, re.U) + if match: + charset = match.group(2) + if charset: + html_text = html_text.encode(charset) + html_tree = fromstring(html_text) + else: + return None + else: + raise title = html_tree.findtext('head/title') if title is None: diff --git a/parse_html/bkmk_ph_util.py b/parse_html/bkmk_ph_util.py index 0fd09f2..1aeb306 100644 --- a/parse_html/bkmk_ph_util.py +++ b/parse_html/bkmk_ph_util.py @@ -10,10 +10,10 @@ __license__ = "GNU GPL" __all__ = ['HTMLParser'] -from HTMLParser import HTMLParser +from m_lib.net.www.html import HTMLParser as _HTMLParser -class HTMLParser(HTMLParser): +class HTMLParser(_HTMLParser): def __init__(self, charset, meta_charset, title, refresh, icon): object.__init__(self) self.charset = charset -- 2.39.5