]> git.phdru.name Git - bookmarks_db.git/blobdiff - Robots/bkmk_rrequests.py
Perf(Robots/requests): Speed up second access
[bookmarks_db.git] / Robots / bkmk_rrequests.py
index d1bf98bfca2e9b5c758cbf762d51697922f614c6..114d67094586ed97d928768803aed3704f075790 100644 (file)
@@ -11,6 +11,8 @@ __license__ = "GNU GPL"
 __all__ = ['robot_requests']
 
 
+from urllib.parse import urlsplit
+
 import requests
 import requests_ftp
 
@@ -20,18 +22,58 @@ requests_ftp.monkeypatch_session()
 
 
 class robot_requests(robot_base):
+    # Pass proxy from the environment like this:
+    # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
+    proxy = None
+
+    # Store hosts that we already know require the proxy...
+    proxy_ok = set()
+    # ...and hosts that aren't accessible even through the proxy
+    proxy_error = set()
+
     def get(self, bookmark, url, accept_charset=False):
-        try:
-            r = requests.Session().get(
-                url, timeout=self.timeout, allow_redirects=False)
-        except requests.RequestException as e:
-            error = str(e)
-            self.log('   Error: %s' % error)
-            return error, None, None, None, None
+        split_results = urlsplit(url)
+        url_host = split_results.hostname
+
+        if url_host in self.proxy_error:
+            return 'proxy error', None, None, None, None
+
+        if url_host in self.proxy_ok:
+            self.log('   Immediately trying with the proxy')
+            error, r = request_get(url, self.timeout, self.proxy)
         else:
-            if r.is_redirect:
-                return None, r.status_code, r.next.url, None, None
-            return None, None, None, r.headers, r.content
+            error, r = request_get(url, self.timeout, None)
+            if error is not None:
+                self.log('   Error: %s' % error)
+                if self.proxy:
+                    self.log('   Retrying with the proxy...')
+                    error, r = request_get(url, self.timeout, self.proxy)
+                    if error is None:
+                        self.proxy_ok.add(url_host)
+        if error is not None:
+            if self.proxy:
+                self.log('   Proxy error: %s' % error)
+                if url_host not in self.proxy_ok:
+                    self.proxy_error.add(url_host)
+            return error, None, None, None, None
+        if r.is_redirect:
+            return None, r.status_code, r.next.url, None, None
+        return None, None, None, r.headers, r.content
 
     def get_ftp_welcome(self):
         return ''  # Alas, requests_ftp doesn't store welcome message
+
+
+def request_get(url, timeout, proxy):
+    if proxy:
+        proxies = {'http': proxy, 'https': proxy}
+    else:
+        proxies = None
+
+    try:
+        r = requests.Session().get(
+            url, timeout=timeout, allow_redirects=False, proxies=proxies)
+    except requests.RequestException as e:
+        return str(e), None
+    else:
+        return None, r