git.phdru.name Git - bookmarks_db.git/commitdiff
Refactor(Robots): Move proxy handling to base class
author Oleg Broytman <phd@phdru.name>
Wed, 31 Jul 2024 15:49:11 +0000 (18:49 +0300)
committer Oleg Broytman <phd@phdru.name>
Wed, 31 Jul 2024 22:48:37 +0000 (01:48 +0300)
This greatly simplifies robots.

Robots/bkmk_robot_base.py
Robots/bkmk_rrequests.py
Robots/bkmk_rurllib.py
Robots/bkmk_rurllib2.py
Robots/bkmk_rurllib_py3.py
get_url.py

index cc2574a00a8a9b31301203e3b17748f03d993bcf..90d287543e08d81eb776d70bcb7c14411ab6db41 100644 (file)
@@ -12,7 +12,7 @@ __all__ = ['robot_base', 'get_error']
 
 
 from base64 import b64encode
-from urllib.parse import urljoin
+from urllib.parse import urlsplit, urljoin
 import sys
 import socket
 import time
@@ -67,6 +67,16 @@ icons = {}
 
 
 class robot_base(Robot):
+    # Pass proxy from the environment like this:
+    # BKMK_ROBOT=requests:proxy=http%3a//localhost%3a8080
+    # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
+    proxy = None
+
+    # Store hosts for which we already know they require proxy...
+    proxy_ok = set()
+    # ...but aren't accessible even through proxy
+    proxy_error = set()
+
     timeout = 60
 
     def __init__(self, *args, **kw):
@@ -79,7 +89,7 @@ class robot_base(Robot):
             bookmark.icon = None
 
             error, http_status_code, redirect_to, headers, content = \
-                self.get(bookmark, bookmark.href, True)
+                self.smart_get(bookmark, bookmark.href, True)
 
             if error:
                 bookmark.error = error
@@ -190,7 +200,7 @@ class robot_base(Robot):
                                     error, icon_status_code, \
                                         icon_redirect_to, icon_headers, \
                                         icon_data = \
-                                        self.get(bookmark, _icon_url)
+                                        self.smart_get(bookmark, _icon_url)
                                     if error:
                                         raise IOError("No icon")
                                         break
@@ -312,6 +322,43 @@ class robot_base(Robot):
         # Tested
         return 1
 
+    def smart_get(self, bookmark, url, accept_charset=False):
+        split_results = urlsplit(url)
+        url_host = split_results.hostname
+
+        if url_host in self.proxy_error:
+            return 'see prev. error', None, None, None, None
+
+        if url_host in self.proxy_ok:
+            self.log('   Immediately trying with the proxy')
+            error, http_status_code, redirect_to, headers, content = \
+                self.get(bookmark, url,
+                         accept_charset=accept_charset,
+                         use_proxy=True)
+        else:
+            error, http_status_code, redirect_to, headers, content = \
+                self.get(bookmark, url,
+                         accept_charset=accept_charset)
+            if error is not None:
+                self.log('   Error          : %s' % error)
+                if self.proxy and error != '404 not_found':
+                    self.log('   Retrying with the proxy...')
+                    error, http_status_code, redirect_to, headers, content = \
+                        self.get(bookmark, url,
+                                 accept_charset=accept_charset,
+                                 use_proxy=True)
+                    if error is None:
+                        self.proxy_ok.add(url_host)
+        if error is not None:
+            if self.proxy and http_status_code != 404:
+                self.log('   Proxy error    : %s' % error)
+                if url_host not in self.proxy_ok:
+                    self.proxy_error.add(url_host)
+            return error, http_status_code, None, None, None
+        if http_status_code:
+            return None, http_status_code, redirect_to, None, None
+        return None, None, None, headers, content
+
     def set_redirect(self, bookmark, errcode, newurl):
         bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
         try:
index 317c473a8750196564380a628f8d84721289b48d..e0e4d2d3a90db09de5643ff694f2bf3cb463ee5d 100644 (file)
@@ -11,7 +11,6 @@ __license__ = "GNU GPL"
 __all__ = ['robot_requests']
 
 
-from urllib.parse import urlsplit
 import warnings
 
 from requests.adapters import HTTPAdapter
@@ -25,48 +24,35 @@ requests_ftp.monkeypatch_session()
 
 
 class robot_requests(robot_base):
-    # Pass proxy from the environment like this:
-    # BKMK_ROBOT=requests:proxy=http%3a//localhost%3a8080
-    # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
-    proxy = None
-
-    # Store hosts for which we already know they require proxy...
-    proxy_ok = set()
-    # ...but aren't accessible even through proxy
-    proxy_error = set()
-
-    def get(self, bookmark, url, accept_charset=False):
-        split_results = urlsplit(url)
-        url_host = split_results.hostname
-
-        if url_host in self.proxy_error:
-            return 'see prev. error', None, None, None, None
-
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         if accept_charset and bookmark.charset:
             headers = request_headers.copy()
             headers['Accept-Charset'] = bookmark.charset
         else:
             headers = request_headers
 
-        if url_host in self.proxy_ok:
-            self.log('   Immediately trying with the proxy')
-            error, r = request_get(url, headers, self.timeout, self.proxy)
+        if use_proxy:
+            proxies = {'http': self.proxy, 'https': self.proxy}
         else:
-            error, r = request_get(url, headers, self.timeout, None)
-            if error is not None:
-                self.log('   Error          : %s' % error)
-                if self.proxy and error != '404 not_found':
-                    self.log('   Retrying with the proxy...')
-                    error, r = request_get(url, headers,
-                                           self.timeout, self.proxy)
-                    if error is None:
-                        self.proxy_ok.add(url_host)
+            proxies = None
+
+        s = requests.Session()
+        s.mount('https://', AllCiphersAdapter())
+
+        error = r = None
+        try:
+            r = s.get(url, headers=headers, timeout=self.timeout,
+                      allow_redirects=False, proxies=proxies,
+                      verify=False)
+        except requests.RequestException as e:
+            error = str(e)
+        else:
+            if r.status_code >= 400:
+                error = requests.status_codes._codes[r.status_code][0]
+                error = '%d %s' % (r.status_code, error)
+
         if error is not None:
-            if self.proxy and r.status_code != 404:
-                self.log('   Proxy error    : %s' % error)
-                if url_host not in self.proxy_ok:
-                    self.proxy_error.add(url_host)
-            return error, r.status_code, None, None, None
+            return error, r.status_code if r else None, None, None, None
         if r.is_redirect:
             return None, r.status_code, r.next.url, None, None
         return None, None, None, r.headers, r.content
@@ -96,25 +82,3 @@ class AllCiphersAdapter(HTTPAdapter):
 
 
 warnings.filterwarnings('ignore', 'Unverified HTTPS request is being made')
-
-
-def request_get(url, headers, timeout, proxy):
-    if proxy:
-        proxies = {'http': proxy, 'https': proxy}
-    else:
-        proxies = None
-
-    s = requests.Session()
-    s.mount('https://', AllCiphersAdapter())
-
-    try:
-        r = s.get(url, headers=headers, timeout=timeout,
-                  allow_redirects=False, proxies=proxies,
-                  verify=False)
-    except requests.RequestException as e:
-        return str(e), None
-    else:
-        if r.status_code >= 400:
-            error = requests.status_codes._codes[r.status_code][0]
-            return '%d %s' % (r.status_code, error), None
-        return None, r
index 057c018e510c9fc6ff4de4e992e6cbd765a0ce63..5b818a6a95ed377526e6d5aa5bd724c93b11ff11 100644 (file)
@@ -84,7 +84,7 @@ urllib.ftpwrapper = myftpwrapper
 
 
 class robot_urllib(robot_base):
-    def get(self, bookmark, url, accept_charset=False):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         try:
             # Set fake referer to the base URL
             opener.addheaders[2] = ('Referer', url)
index 4b8927bfcf83a4fa2bb5c9e9a2b615ce8c9e8a88..c33c27568973de187d04b50bb48762af54d75b66 100644 (file)
@@ -40,7 +40,7 @@ urllib2.install_opener(opener)
 
 
 class robot_urllib2(robot_base):
-    def get(self, bookmark, url, accept_charset=False):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         request = urllib2.Request(url)
         for h, v in request_headers.items():
             request.add_header(h, v)
index 268dad43cbb3444b910c8428d436603282503b49..b5c798ae840efb9a4b51d6f22b93b17ab7aed4e4 100644 (file)
@@ -85,7 +85,7 @@ urllib.request.ftpwrapper = myftpwrapper
 
 
 class robot_urllib_py3(robot_base):
-    def get(self, bookmark, url, accept_charset=False):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         try:
             # Set fake referer to the base URL
             opener.addheaders[2] = ('Referer', url)
index 90e857000b13a2418f37c2d616e758c3ec91221c..47b9e4427788e92ed2cee5ba61124f7e12b3295c 100755 (executable)
@@ -30,7 +30,7 @@ def run():
     bookmark.parent = None
 
     error, redirect_code, redirect_to, headers, content = \
-        robot.get(bookmark, url, True)
+        robot.smart_get(bookmark, url, True)
 
     if error:
         print(error)