Fix(Robots/bkmk_robot_base): Add forgotten spaces in log

[bookmarks_db.git] / parse_html / bkmk_ph_htmlparser.py
diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py

index 05ad6584d47ec37f67ae5883181fdc6de5328880..d11a2ff9fbeab4b5e5ec8daa1c2a7b4205ac63e4 100644 (file)
--- a/parse_html/bkmk_ph_htmlparser.py
+++ b/parse_html/bkmk_ph_htmlparser.py
@@ -11,7 +11,10 @@ __license__ = "GNU GPL"
  __all__ = ['parse_html']
  
  
-from HTMLParser import HTMLParseError
+try:
+    from HTMLParser import HTMLParseError
+except ImportError:
+    class HTMLParseError(Exception): pass
  from m_lib.net.www.html import HTMLParser as _HTMLParser
  
  
@@ -47,8 +50,10 @@ class HTMLParser(_HTMLParser):
  
          if (not self.charset) and (http_equiv == "content-type"):
              try:
-                # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
-                self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
+                # extract charset from
+                # "text/html; foo; charset=UTF-8, bar; baz;"
+                self.charset = content.lower().split('charset=')[1].\
+                    split(';')[0].split(',')[0]
                  # Remember that the charset was retrieved from
                  # META tag, not from the Content-Type header
                  self.meta_charset = 1
@@ -72,7 +77,9 @@ class HTMLParser(_HTMLParser):
          for attrname, value in attrs:
              if value:
                  value = value.strip()
-                if (attrname == 'rel') and (value.lower() in ('icon', 'shortcut icon')):
+                if (attrname == 'rel') and (
+                        value.lower() in ('icon', 'shortcut icon')
+                ):
                      has_icon = True
                  elif attrname == 'href':
                      href = value
@@ -82,6 +89,13 @@ class HTMLParser(_HTMLParser):
  
  
  def parse_html(html_text, charset=None, log=None):
+    if not html_text:
+        return None
+    if charset is None and isinstance(html_text, bytes):
+        return None  # html.parser cannot parse bytes
+    if charset and isinstance(html_text, bytes):
+        html_text = html_text.decode(charset)
+
      parser = HTMLParser(charset)
  
      try:
@@ -94,6 +108,7 @@ def parse_html(html_text, charset=None, log=None):
      except (HTMLParseError, HTMLHeadDone):
          pass
  
-    if (parser.title is None) and (parser.refresh is None) and (parser.icon is None):
+    if (parser.title is None) and (parser.refresh is None) \
+            and (parser.icon is None):
          return None
      return parser