git.phdru.name Git - bookmarks_db.git/commitdiff
Fix(Py3): Fix `urllib`-based robot
authorOleg Broytman <phd@phdru.name>
Sun, 12 Nov 2023 10:01:29 +0000 (13:01 +0300)
committerOleg Broytman <phd@phdru.name>
Sun, 12 Nov 2023 19:21:09 +0000 (22:21 +0300)
Robots/bkmk_rurllib_py3.py [new file with mode: 0644]
robots.py

diff --git a/Robots/bkmk_rurllib_py3.py b/Robots/bkmk_rurllib_py3.py
new file mode 100644 (file)
index 0000000..b392887
--- /dev/null
@@ -0,0 +1,136 @@
+"""Simple, straightforward robot based on urllib
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_urllib_py3']
+
+
+import sys
+import urllib.request
+
+from Robots.bkmk_robot_base import robot_base, get_error
+
+
class RedirectException(Exception):
    """Signal an HTTP redirect (301/302/303/307) to the caller.

    Carries the numeric status code and the target URL so the robot can
    record where the bookmark moved to.
    """

    def __init__(self, errcode, newurl):
        super().__init__()
        # HTTP status code that triggered the redirect.
        self.errcode = errcode
        # URL from the Location/URI header (or a placeholder).
        self.newurl = newurl
+
+
class MyURLopener(urllib.request.URLopener):
    """URLopener that turns redirects and HTTP errors into exceptions.

    Redirect responses (301/302/303/307) are raised as RedirectException
    so the robot can record the new URL; all other HTTP errors are raised
    as IOError carrying a ('http error', code, message, headers) tuple.

    NOTE(review): urllib.request.URLopener is deprecated in Python 3 and
    removed in recent CPython releases — confirm the target Python version.
    """

    # Error 301 -- relocated (permanently)
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        # Prefer the standard Location header, fall back to the
        # non-standard URI header, else use a placeholder string.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            newurl = "Nowhere"
        raise RedirectException(errcode, newurl)

    # Error 302 -- relocated (temporarily)
    http_error_302 = http_error_301
    # Error 303 -- relocated (see other)
    http_error_303 = http_error_301
    # Error 307 -- relocated (temporarily)
    http_error_307 = http_error_301

    # Error 401 -- authentication required
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        raise IOError(
            ('http error', errcode, "Authentication required ", headers))

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Drain and close the response body before reporting the error,
        # so the connection is not left half-read.
        if fp:
            fp.read()
            fp.close()
        raise IOError(('http error', errcode, errmsg, headers))
+
+
# Install our opener so urllib.request.urlretrieve() uses it.
urllib.request._opener = MyURLopener()

# Fake headers to pretend this is a real browser.
# The two literals are parenthesized so they concatenate into ONE string;
# a bare string on its own line would be a no-op statement and the
# User-Agent would be silently truncated.
_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
               " Gecko/20001221 Firefox/2.0.0")
urllib.request._opener.addheaders[0] = ('User-Agent', _user_agent)
_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
    sys.version_info[0], sys.version_info[1],
    sys.version_info[2], urllib.request.__version__
)
urllib.request._opener.addheader('X-User-Agent', _x_user_agent)
# Placeholder Referer at slot [2]; robot_urllib_py3.get() rewrites it per URL.
urllib.request._opener.addheader('Referer', '')

urllib.request._opener.addheader('Accept', '*/*')
urllib.request._opener.addheader('Accept-Language', 'ru,en')
urllib.request._opener.addheader('Cache-Control', 'max-age=300')
urllib.request._opener.addheader('Connection', 'close')
+
+
# Keep a reference to the original ftpwrapper so the subclass can delegate.
urllib_ftpwrapper = urllib.request.ftpwrapper
# Cache key of the most recently opened FTP connection; written by
# myftpwrapper.__init__, consumed and reset by
# robot_urllib_py3.get_ftp_welcome().
ftpcache_key = None


class myftpwrapper(urllib_ftpwrapper):
    """ftpwrapper that records the cache key of the connection it opens.

    The key mirrors the one urllib uses for its ftpcache, letting the robot
    look up the live connection later to read the FTP welcome banner.
    """

    def __init__(self, user, passwd, host, port, dirs):
        urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
        global ftpcache_key
        # Same key format urllib builds for its ftpcache dictionary.
        ftpcache_key = (user, host, port, '/'.join(dirs))


# Monkey-patch urllib so every FTP connection goes through our subclass.
urllib.request.ftpwrapper = myftpwrapper
+
+
class robot_urllib_py3(robot_base):
    """Robot that fetches URLs with urllib.request (Python 3 port)."""

    def get(self, bookmark, url, accept_charset=False):
        """Fetch url and return (error, errcode, newurl, headers, content).

        Exactly one outcome is filled in: a successful fetch returns
        headers+content, a redirect returns errcode+newurl, a failure
        returns error; the remaining slots are None.
        """
        try:
            # Set fake referer to the base URL (slot [2] was reserved by the
            # module-level addheader('Referer', '') call).
            urllib.request._opener.addheaders[2] = ('Referer', url)

            if accept_charset and bookmark.charset:
                urllib.request._opener.addheader(
                    'Accept-Charset', bookmark.charset)
            try:
                fname, headers = urllib.request.urlretrieve(url)
            finally:
                if accept_charset and bookmark.charset:
                    # Remove Accept-Charset
                    del urllib.request._opener.addheaders[-1]

            # Context manager guarantees the temp file is closed even if
            # read() raises.
            with open(fname, 'rt') as infile:
                content = infile.read()

            return None, None, None, headers, content

        except RedirectException as e:
            return None, e.errcode, e.newurl, None, None

        except IOError as e:
            # Python 3: exceptions are not indexable (e[0] is a TypeError),
            # so inspect e.args.  MyURLopener raises
            # IOError(('http error', code, msg, headers)) — a single tuple
            # argument — so unwrap that tuple before testing.
            args = e.args
            if len(args) == 1 and isinstance(args[0], tuple):
                args = args[0]
            if len(args) >= 2 and (args[0] == "http error") \
                    and (args[1] == -1):
                error = None
                # One parenthesized string: a bare literal on a second line
                # would be a discarded no-op statement.
                bookmark.no_error = (
                    "The server did not return any header - "
                    "it is not an error, actually")
                self.log('   no headers: %s' % bookmark.no_error)
            else:
                error = get_error(e)
                self.log('   Error: %s' % error)

            return error, None, None, None, None

    def get_ftp_welcome(self):
        """Return the welcome banner of the last FTP connection opened."""
        global ftpcache_key
        _welcome = urllib.request._opener.ftpcache[ftpcache_key].ftp.welcome
        # I am assuming there are no duplicate ftp URLs in db.
        # If there are - ftpcache_key in next line is invalid.
        ftpcache_key = None
        return _welcome

    def finish_check_url(self, bookmark):
        """Run base-class bookkeeping, then drop urlretrieve's temp files."""
        robot_base.finish_check_url(self, bookmark)
        urllib.request.urlcleanup()
index 8fac908dd92bec6117d8a59a36eab17bd73b41e4..1571d58e2af9b3b63c3e3ed1f0cd5bd7d1651e07 100644 (file)
--- a/robots.py
+++ b/robots.py
@@ -11,6 +11,7 @@ __license__ = "GNU GPL"
 __all__ = ['import_robot', 'robot']
 
 
+import sys
 from os import environ
 from bkmk_objects import parse_params, set_params
 
@@ -18,6 +19,8 @@ robot_name, robot_params = parse_params(environ.get("BKMK_ROBOT", "forking"))
 
 
 def import_robot(robot_name):
+    if (robot_name == 'urllib') and (sys.version_info[0] >= 3):
+        robot_name = 'urllib_py3'
     ns = locals()
     exec("from Robots import bkmk_r%s" % robot_name, globals(), ns)
     exec("robot = bkmk_r%s.robot_%s" % (robot_name, robot_name), globals(), ns)