This greatly simplifies the robot implementations.
from base64 import b64encode
-from urllib.parse import urljoin
+from urllib.parse import urlsplit, urljoin
import sys
import socket
import time
class robot_base(Robot):
+ # Pass proxy from the environment like this:
+ # BKMK_ROBOT=requests:proxy=http%3a//localhost%3a8080
+ # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
+ proxy = None
+
+ # Store hosts for which we already know they require proxy...
+ proxy_ok = set()
+ # ...but aren't accessible even through proxy
+ proxy_error = set()
+
timeout = 60
def __init__(self, *args, **kw):
bookmark.icon = None
error, http_status_code, redirect_to, headers, content = \
- self.get(bookmark, bookmark.href, True)
+ self.smart_get(bookmark, bookmark.href, True)
if error:
bookmark.error = error
error, icon_status_code, \
icon_redirect_to, icon_headers, \
icon_data = \
- self.get(bookmark, _icon_url)
+ self.smart_get(bookmark, _icon_url)
if error:
raise IOError("No icon")
break
# Tested
return 1
+    def smart_get(self, bookmark, url, accept_charset=False):
+        """Fetch *url* via self.get(), retrying through self.proxy on failure.
+
+        Returns the same 5-tuple as get():
+        (error, http_status_code, redirect_to, headers, content).
+        Per-host proxy outcomes are cached in self.proxy_ok (host works
+        only via proxy) and self.proxy_error (host fails even via proxy).
+        """
+        split_results = urlsplit(url)
+        url_host = split_results.hostname
+
+        # Host is already known to fail even through the proxy -- give up early.
+        if url_host in self.proxy_error:
+            return 'see prev. error', None, None, None, None
+
+        if url_host in self.proxy_ok:
+            # Host is known to need the proxy -- skip the direct attempt.
+            self.log(' Immediately trying with the proxy')
+            error, http_status_code, redirect_to, headers, content = \
+                self.get(bookmark, url,
+                         accept_charset=accept_charset,
+                         use_proxy=True)
+        else:
+            # First try a direct (non-proxied) fetch.
+            error, http_status_code, redirect_to, headers, content = \
+                self.get(bookmark, url,
+                         accept_charset=accept_charset)
+            if error is not None:
+                self.log(' Error : %s' % error)
+                # Retry through the proxy unless the error was a plain 404
+                # (the resource genuinely does not exist; a proxy won't help).
+                if self.proxy and error != '404 not_found':
+                    self.log(' Retrying with the proxy...')
+                    error, http_status_code, redirect_to, headers, content = \
+                        self.get(bookmark, url,
+                                 accept_charset=accept_charset,
+                                 use_proxy=True)
+                    if error is None:
+                        # Proxy succeeded -- remember for later URLs on this host.
+                        self.proxy_ok.add(url_host)
+        if error is not None:
+            if self.proxy and http_status_code != 404:
+                self.log(' Proxy error : %s' % error)
+                if url_host not in self.proxy_ok:
+                    # Failed both directly and via the proxy -- blacklist host.
+                    self.proxy_error.add(url_host)
+            return error, http_status_code, None, None, None
+        if http_status_code:
+            # get() reported a redirect: status code plus the target URL.
+            return None, http_status_code, redirect_to, None, None
+        return None, None, None, headers, content
+
def set_redirect(self, bookmark, errcode, newurl):
bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
try:
__all__ = ['robot_requests']
-from urllib.parse import urlsplit
import warnings
from requests.adapters import HTTPAdapter
class robot_requests(robot_base):
- # Pass proxy from the environment like this:
- # BKMK_ROBOT=requests:proxy=http%3a//localhost%3a8080
- # BKMK_ROBOT=requests:proxy=socks5h%3a//localhost%3a1080
- proxy = None
-
- # Store hosts for which we already know they require proxy...
- proxy_ok = set()
- # ...but aren't accessible even through proxy
- proxy_error = set()
-
- def get(self, bookmark, url, accept_charset=False):
- split_results = urlsplit(url)
- url_host = split_results.hostname
-
- if url_host in self.proxy_error:
- return 'see prev. error', None, None, None, None
-
+ def get(self, bookmark, url, accept_charset=False, use_proxy=False):
if accept_charset and bookmark.charset:
headers = request_headers.copy()
headers['Accept-Charset'] = bookmark.charset
else:
headers = request_headers
- if url_host in self.proxy_ok:
- self.log(' Immediately trying with the proxy')
- error, r = request_get(url, headers, self.timeout, self.proxy)
+ if use_proxy:
+ proxies = {'http': self.proxy, 'https': self.proxy}
else:
- error, r = request_get(url, headers, self.timeout, None)
- if error is not None:
- self.log(' Error : %s' % error)
- if self.proxy and error != '404 not_found':
- self.log(' Retrying with the proxy...')
- error, r = request_get(url, headers,
- self.timeout, self.proxy)
- if error is None:
- self.proxy_ok.add(url_host)
+ proxies = None
+
+ s = requests.Session()
+ s.mount('https://', AllCiphersAdapter())
+
+ error = r = None
+ try:
+ r = s.get(url, headers=headers, timeout=self.timeout,
+ allow_redirects=False, proxies=proxies,
+ verify=False)
+ except requests.RequestException as e:
+ error = str(e)
+ else:
+ if r.status_code >= 400:
+ error = requests.status_codes._codes[r.status_code][0]
+ error = '%d %s' % (r.status_code, error)
+
if error is not None:
- if self.proxy and r.status_code != 404:
- self.log(' Proxy error : %s' % error)
- if url_host not in self.proxy_ok:
- self.proxy_error.add(url_host)
- return error, r.status_code, None, None, None
+ return error, r.status_code if r else None, None, None, None
if r.is_redirect:
return None, r.status_code, r.next.url, None, None
return None, None, None, r.headers, r.content
warnings.filterwarnings('ignore', 'Unverified HTTPS request is being made')
-
-
-def request_get(url, headers, timeout, proxy):
- if proxy:
- proxies = {'http': proxy, 'https': proxy}
- else:
- proxies = None
-
- s = requests.Session()
- s.mount('https://', AllCiphersAdapter())
-
- try:
- r = s.get(url, headers=headers, timeout=timeout,
- allow_redirects=False, proxies=proxies,
- verify=False)
- except requests.RequestException as e:
- return str(e), None
- else:
- if r.status_code >= 400:
- error = requests.status_codes._codes[r.status_code][0]
- return '%d %s' % (r.status_code, error), None
- return None, r
class robot_urllib(robot_base):
- def get(self, bookmark, url, accept_charset=False):
+ def get(self, bookmark, url, accept_charset=False, use_proxy=False):
try:
# Set fake referer to the base URL
opener.addheaders[2] = ('Referer', url)
class robot_urllib2(robot_base):
- def get(self, bookmark, url, accept_charset=False):
+ def get(self, bookmark, url, accept_charset=False, use_proxy=False):
request = urllib2.Request(url)
for h, v in request_headers.items():
request.add_header(h, v)
class robot_urllib_py3(robot_base):
- def get(self, bookmark, url, accept_charset=False):
+ def get(self, bookmark, url, accept_charset=False, use_proxy=False):
try:
# Set fake referer to the base URL
opener.addheaders[2] = ('Referer', url)
bookmark.parent = None
error, redirect_code, redirect_to, headers, content = \
- robot.get(bookmark, url, True)
+ robot.smart_get(bookmark, url, True)
if error:
print(error)