]> git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_robot_base.py
Refactor(Robots): Refactor request headers
[bookmarks_db.git] / Robots / bkmk_robot_base.py
index 2024ab85efd74594eb4700078f19f7e012ae2b78..df33a26bd43162d92252e04df933bcb378abaffe 100644 (file)
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_base', 'get_error']
@@ -24,6 +24,22 @@ from bkmk_objects import Robot
 from parse_html import parse_html
 
 
+# Fake headers to pretend this is a real browser
+_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
+               " Gecko/20001221 Firefox/2.0.0")
+_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
+
+request_headers = {
+    'Accept': '*/*',
+    'Accept-Language': 'ru,en',
+    'Cache-Control': 'max-age=300',
+    'Connection': 'close',
+    'Referer': '/',
+    'User-Agent': _user_agent,
+    'X-User-Agent': _x_user_agent,
+}
+
+
 reloc_dict = {
   301: "perm1.",
   302: "temp2.",
@@ -135,8 +151,13 @@ class robot_base(Robot):
                             break
                     content_stripped = content.strip()
                     if content_stripped and charset:
-                        content_stripped = content_stripped.decode(
-                            charset, 'replace')
+                        try:
+                            content_stripped = content_stripped.decode(
+                                charset, 'replace')
+                        except LookupError:
+                            charset = None
+                            self.log("   unknown charset "
+                                     "in Content-Type header")
                     if content_stripped and is_html:
                         parser = parse_html(
                             content_stripped, charset, self.log)
@@ -211,7 +232,7 @@ class robot_base(Robot):
                                                        bookmark.icon
                                                        )
                                 else:
-                                    self.log("   no icon        :"
+                                    self.log("   no icon        : "
                                              "bad content type '%s'"
                                              % content_type
                                              )