From a58264565a5ef1af5800d0b89505640739ae0212 Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Wed, 31 Jul 2024 18:49:11 +0300
Subject: [PATCH] Refactor(Robots): Move proxy handling to base class

This greatly simplifies robots.
---
 Robots/bkmk_robot_base.py  | 53 +++++++++++++++++++++++--
 Robots/bkmk_rrequests.py   | 81 +++++++++++---------------------------
 Robots/bkmk_rurllib.py     |  2 +-
 Robots/bkmk_rurllib2.py    |  2 +-
 Robots/bkmk_rurllib_py3.py |  2 +-
 get_url.py                 |  2 +-
 6 files changed, 78 insertions(+), 64 deletions(-)

diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py
index cc2574a..90d2875 100644
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -12,7 +12,7 @@ __all__ = ['robot_base', 'get_error']
 
 
 from base64 import b64encode
-from urllib.parse import urljoin
+from urllib.parse import urlsplit, urljoin
 import sys
 import socket
 import time
@@ -67,6 +67,16 @@ icons = {}
 
 
 class robot_base(Robot):
+    # Pass proxy from the environment like this:
+    # BKMK_ROBOT=requests:proxy=http%3a//localhost%3a8080
+    # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
+    proxy = None
+
+    # Store hosts for which we already know they require proxy...
+    proxy_ok = set()
+    # ...but aren't accessible even through proxy
+    proxy_error = set()
+
     timeout = 60
 
     def __init__(self, *args, **kw):
@@ -79,7 +89,7 @@ class robot_base(Robot):
             bookmark.icon = None
 
             error, http_status_code, redirect_to, headers, content = \
-                self.get(bookmark, bookmark.href, True)
+                self.smart_get(bookmark, bookmark.href, True)
 
             if error:
                 bookmark.error = error
@@ -190,7 +200,7 @@ class robot_base(Robot):
                                         error, icon_status_code, \
                                             icon_redirect_to, icon_headers, \
                                             icon_data = \
-                                            self.get(bookmark, _icon_url)
+                                            self.smart_get(bookmark, _icon_url)
                                         if error:
                                             raise IOError("No icon")
                                             break
@@ -312,6 +322,43 @@ class robot_base(Robot):
         # Tested
         return 1
 
+    def smart_get(self, bookmark, url, accept_charset=False):
+        split_results = urlsplit(url)
+        url_host = split_results.hostname
+
+        if url_host in self.proxy_error:
+            return 'see prev. error', None, None, None, None
+
+        if url_host in self.proxy_ok:
+            self.log(' Immediately trying with the proxy')
+            error, http_status_code, redirect_to, headers, content = \
+                self.get(bookmark, url,
+                         accept_charset=accept_charset,
+                         use_proxy=True)
+        else:
+            error, http_status_code, redirect_to, headers, content = \
+                self.get(bookmark, url,
+                         accept_charset=accept_charset)
+            if error is not None:
+                self.log(' Error : %s' % error)
+                if self.proxy and error != '404 not_found':
+                    self.log(' Retrying with the proxy...')
+                    error, http_status_code, redirect_to, headers, content = \
+                        self.get(bookmark, url,
+                                 accept_charset=accept_charset,
+                                 use_proxy=True)
+                    if error is None:
+                        self.proxy_ok.add(url_host)
+        if error is not None:
+            if self.proxy and http_status_code != 404:
+                self.log(' Proxy error : %s' % error)
+                if url_host not in self.proxy_ok:
+                    self.proxy_error.add(url_host)
+            return error, http_status_code, None, None, None
+        if http_status_code:
+            return None, http_status_code, redirect_to, None, None
+        return None, None, None, headers, content
+
     def set_redirect(self, bookmark, errcode, newurl):
         bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
         try:
diff --git a/Robots/bkmk_rrequests.py b/Robots/bkmk_rrequests.py
index 317c473..e0e4d2d 100644
--- a/Robots/bkmk_rrequests.py
+++ b/Robots/bkmk_rrequests.py
@@ -11,7 +11,6 @@ __license__ = "GNU GPL"
 __all__ = ['robot_requests']
 
 
-from urllib.parse import urlsplit
 import warnings
 
 from requests.adapters import HTTPAdapter
@@ -25,48 +24,38 @@ requests_ftp.monkeypatch_session()
 
 
 class robot_requests(robot_base):
-    # Pass proxy from the environment like this:
-    # BKMK_ROBOT=requests:proxy=http%3a//localhost%3a8080
-    # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
-    proxy = None
-
-    # Store hosts for which we already know they require proxy...
-    proxy_ok = set()
-    # ...but aren't accessible even through proxy
-    proxy_error = set()
-
-    def get(self, bookmark, url, accept_charset=False):
-        split_results = urlsplit(url)
-        url_host = split_results.hostname
-
-        if url_host in self.proxy_error:
-            return 'see prev. error', None, None, None, None
-
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         if accept_charset and bookmark.charset:
             headers = request_headers.copy()
             headers['Accept-Charset'] = bookmark.charset
         else:
             headers = request_headers
 
-        if url_host in self.proxy_ok:
-            self.log(' Immediately trying with the proxy')
-            error, r = request_get(url, headers, self.timeout, self.proxy)
+        if use_proxy:
+            proxies = {'http': self.proxy, 'https': self.proxy}
         else:
-            error, r = request_get(url, headers, self.timeout, None)
-            if error is not None:
-                self.log(' Error : %s' % error)
-                if self.proxy and error != '404 not_found':
-                    self.log(' Retrying with the proxy...')
-                    error, r = request_get(url, headers,
-                                           self.timeout, self.proxy)
-                    if error is None:
-                        self.proxy_ok.add(url_host)
+            proxies = None
+
+        s = requests.Session()
+        s.mount('https://', AllCiphersAdapter())
+
+        error = r = None
+        try:
+            r = s.get(url, headers=headers, timeout=self.timeout,
+                      allow_redirects=False, proxies=proxies,
+                      verify=False)
+        except requests.RequestException as e:
+            error = str(e)
+        else:
+            if r.status_code >= 400:
+                error = requests.status_codes._codes[r.status_code][0]
+                error = '%d %s' % (r.status_code, error)
+
         if error is not None:
-            if self.proxy and r.status_code != 404:
-                self.log(' Proxy error : %s' % error)
-                if url_host not in self.proxy_ok:
-                    self.proxy_error.add(url_host)
-            return error, r.status_code, None, None, None
+            # Compare with None explicitly: a Response with a 4xx/5xx
+            # status is falsy, and its status code must still be returned.
+            status_code = r.status_code if r is not None else None
+            return error, status_code, None, None, None
         if r.is_redirect:
             return None, r.status_code, r.next.url, None, None
         return None, None, None, r.headers, r.content
@@ -96,25 +85,3 @@ class AllCiphersAdapter(HTTPAdapter):
 
 
 warnings.filterwarnings('ignore', 'Unverified HTTPS request is being made')
-
-
-def request_get(url, headers, timeout, proxy):
-    if proxy:
-        proxies = {'http': proxy, 'https': proxy}
-    else:
-        proxies = None
-
-    s = requests.Session()
-    s.mount('https://', AllCiphersAdapter())
-
-    try:
-        r = s.get(url, headers=headers, timeout=timeout,
-                  allow_redirects=False, proxies=proxies,
-                  verify=False)
-    except requests.RequestException as e:
-        return str(e), None
-    else:
-        if r.status_code >= 400:
-            error = requests.status_codes._codes[r.status_code][0]
-            return '%d %s' % (r.status_code, error), None
-        return None, r
diff --git a/Robots/bkmk_rurllib.py b/Robots/bkmk_rurllib.py
index 057c018..5b818a6 100644
--- a/Robots/bkmk_rurllib.py
+++ b/Robots/bkmk_rurllib.py
@@ -84,7 +84,7 @@ urllib.ftpwrapper = myftpwrapper
 
 
 class robot_urllib(robot_base):
-    def get(self, bookmark, url, accept_charset=False):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         try:
             # Set fake referer to the base URL
             opener.addheaders[2] = ('Referer', url)
diff --git a/Robots/bkmk_rurllib2.py b/Robots/bkmk_rurllib2.py
index 4b8927b..c33c275 100644
--- a/Robots/bkmk_rurllib2.py
+++ b/Robots/bkmk_rurllib2.py
@@ -40,7 +40,7 @@ urllib2.install_opener(opener)
 
 
 class robot_urllib2(robot_base):
-    def get(self, bookmark, url, accept_charset=False):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         request = urllib2.Request(url)
         for h, v in request_headers.items():
             request.add_header(h, v)
diff --git a/Robots/bkmk_rurllib_py3.py b/Robots/bkmk_rurllib_py3.py
index 268dad4..b5c798a 100644
--- a/Robots/bkmk_rurllib_py3.py
+++ b/Robots/bkmk_rurllib_py3.py
@@ -85,7 +85,7 @@ urllib.request.ftpwrapper = myftpwrapper
 
 
 class robot_urllib_py3(robot_base):
-    def get(self, bookmark, url, accept_charset=False):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
         try:
             # Set fake referer to the base URL
             opener.addheaders[2] = ('Referer', url)
diff --git a/get_url.py b/get_url.py
index 90e8570..47b9e44 100755
--- a/get_url.py
+++ b/get_url.py
@@ -30,7 +30,7 @@ def run():
     bookmark.parent = None
 
     error, redirect_code, redirect_to, headers, content = \
-        robot.get(bookmark, url, True)
+        robot.smart_get(bookmark, url, True)
 
     if error:
         print(error)
-- 
2.39.5