Fix(parse_html): Do not parse empty strings

author Oleg Broytman <phd@phdru.name>

Thu, 16 Nov 2023 05:33:45 +0000 (08:33 +0300)

committer Oleg Broytman <phd@phdru.name>

Fri, 17 Nov 2023 10:58:35 +0000 (13:58 +0300)
author Oleg Broytman <phd@phdru.name>
Thu, 16 Nov 2023 05:33:45 +0000 (08:33 +0300)
committer Oleg Broytman <phd@phdru.name>
Fri, 17 Nov 2023 10:58:35 +0000 (13:58 +0300)
diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py

index 1d762d94d1713f951d583f120d658f2f9f8d17dd..f32e1b304d45aab959b047d51db4140241b90633 100644 (file)
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -144,13 +144,12 @@ class robot_base(Robot):
                      except (ValueError, IndexError):
                          charset = None
                          self.log("   no charset in Content-Type header")
+                    is_html = False
                      for ctype in ("text/html", "application/xhtml+xml"):
                          if content_type.startswith(ctype):
-                            html = True
+                            is_html = True
                              break
-                    else:
-                        html = False
-                    if html:
+                    if content and is_html:
                          parser = parse_html(content, charset, self.log)
                          if parser:
                              bookmark.real_title = parser.title
@@ -247,6 +246,10 @@ class robot_base(Robot):
                                                    % (url, timeout)
                                                    )
  
+                    if not content:
+                        self.log("   empty response, no content")
+                    if not is_html:
+                        self.log("   not html")
                  except KeyError as key:
                      self.log("   no header: %s" % key)
  
diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py

index be5daab26e85ec47620838f506b49085c392e78f..7764303a4e754715a86a3a12e5d0b48d1d5acb8c 100644 (file)
--- a/parse_html/bkmk_parse_html.py
+++ b/parse_html/bkmk_parse_html.py
@@ -101,7 +101,7 @@ BKMK_DEBUG_HTML_PARSERS = os.environ.get("BKMK_DEBUG_HTML_PARSERS")
  
  
  def parse_html(html_text, charset=None, log=None):
-    if not parsers:
+    if not html_text or not parsers:
          return None
  
      if charset:
diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py

index a02de919fb3cfef336b287ceb753e2d3d0cd980f..24274826c52ee40179a83d96ff1eb07ceb60e6ec 100644 (file)
--- a/parse_html/bkmk_ph_lxml.py
+++ b/parse_html/bkmk_ph_lxml.py
@@ -17,6 +17,8 @@ from .bkmk_ph_util import HTMLParser
  
  
  def parse_html(html_text, charset=None, log=None):
+    if not html_text:
+        return None
      try:
          html_tree = fromstring(html_text)
      except ValueError as e:
author	Oleg Broytman <phd@phdru.name>
	Thu, 16 Nov 2023 05:33:45 +0000 (08:33 +0300)
committer	Oleg Broytman <phd@phdru.name>
	Fri, 17 Nov 2023 10:58:35 +0000 (13:58 +0300)
Robots/bkmk_robot_base.py		patch | blob | history
parse_html/bkmk_parse_html.py		patch | blob | history
parse_html/bkmk_ph_lxml.py		patch | blob | history