From e69ee847224026d24684847d19d178f5b0db9309 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Thu, 16 Nov 2023 08:33:45 +0300 Subject: [PATCH] Fix(parse_html): Do not parse empty strings --- Robots/bkmk_robot_base.py | 11 +++++++---- parse_html/bkmk_parse_html.py | 2 +- parse_html/bkmk_ph_lxml.py | 2 ++ 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index 1d762d9..f32e1b3 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -144,13 +144,12 @@ class robot_base(Robot): except (ValueError, IndexError): charset = None self.log(" no charset in Content-Type header") + is_html = False for ctype in ("text/html", "application/xhtml+xml"): if content_type.startswith(ctype): - html = True + is_html = True break - else: - html = False - if html: + if content and is_html: parser = parse_html(content, charset, self.log) if parser: bookmark.real_title = parser.title @@ -247,6 +246,10 @@ class robot_base(Robot): % (url, timeout) ) + if not content: + self.log(" empty response, no content") + if not is_html: + self.log(" not html") except KeyError as key: self.log(" no header: %s" % key) diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index be5daab..7764303 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -101,7 +101,7 @@ BKMK_DEBUG_HTML_PARSERS = os.environ.get("BKMK_DEBUG_HTML_PARSERS") def parse_html(html_text, charset=None, log=None): - if not parsers: + if not html_text or not parsers: return None if charset: diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index a02de91..2427482 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -17,6 +17,8 @@ from .bkmk_ph_util import HTMLParser def parse_html(html_text, charset=None, log=None): + if not html_text: + return None try: html_tree = fromstring(html_text) except ValueError as e: -- 2.39.5