From: Oleg Broytman Date: Tue, 20 Aug 2024 22:21:26 +0000 (+0300) Subject: Refactor(Robots): Pass headers instead of charset X-Git-Tag: 6.1.0~13 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=db6d0bf8c59507f518de5da57c37593246356ef9;p=bookmarks_db.git Refactor(Robots): Pass headers instead of charset --- diff --git a/Robots/base.py b/Robots/base.py index d9551be..e3cf461 100644 --- a/Robots/base.py +++ b/Robots/base.py @@ -105,8 +105,14 @@ class robot_base(Robot): self.start = int(time.time()) bookmark.icon = None + if bookmark.charset: + headers = request_headers.copy() + headers['Accept-Charset'] = bookmark.charset + else: + headers = request_headers + error, http_status_code, redirect_to, headers, content = \ - await self.get_url(bookmark.href, bookmark.charset) + await self.get_url(bookmark.href, headers) if error is not None: bookmark.error = error @@ -221,7 +227,8 @@ class robot_base(Robot): icon_error, \ icon_status_code, icon_redirect_to, \ icon_headers, icon_data = \ - await self.get_url(_icon_url) + await self.get_url( + _icon_url, request_headers) if icon_error: raise IOError("No icon: " + icon_error) break @@ -345,7 +352,7 @@ class robot_base(Robot): finally: self.finish_check_url(bookmark) - async def get_url(self, url, accept_charset=None): + async def get_url(self, url, headers): split_results = urlsplit(url) url_proto = split_results.scheme url_host = split_results.hostname @@ -367,11 +374,10 @@ class robot_base(Robot): if use_proxy and url_host in self.proxy_ok: self.log(' Immediately trying with the proxy') error, http_status_code, redirect_to, headers, content = \ - await self.get(url, accept_charset=accept_charset, - use_proxy=True) + await self.get(url, headers, use_proxy=True) else: error, http_status_code, redirect_to, headers, content = \ - await self.get(url, accept_charset=accept_charset) + await self.get(url, headers) if error is not None and ( not url_host.startswith('localhost') and not url_host.startswith('127.') @@ -380,8 +386,7 @@ class robot_base(Robot): if use_proxy and http_status_code != 404: self.log(' Retrying with the proxy...') error, http_status_code, redirect_to, headers, content = \ - await self.get(url, accept_charset=accept_charset, - use_proxy=True) + await self.get(url, headers, use_proxy=True) if error is None: self.proxy_ok.add(url_host) if (error is not None) or ( diff --git a/Robots/bkmk_raiohttp.py b/Robots/bkmk_raiohttp.py index 30294c0..fa59360 100644 --- a/Robots/bkmk_raiohttp.py +++ b/Robots/bkmk_raiohttp.py @@ -22,14 +22,14 @@ import aioftp import aiohttp import aiohttp.client_exceptions -from Robots.base import robot_base, request_headers +from Robots.base import robot_base class robot_aiohttp(robot_base): def version_str(self): return 'aiohttp/%s' % aiohttp.__version__ - async def get(self, url, accept_charset=None, use_proxy=False): + async def get(self, url, headers, use_proxy=False): if url.startswith('ftp://'): error, body = await _get_ftp( url, timeout=self.ftp_timeout, @@ -39,12 +39,6 @@ class robot_aiohttp(robot_base): return error, None, None, None, None return None, None, None, None, body - if accept_charset: - headers = request_headers.copy() - headers['Accept-Charset'] = accept_charset - else: - headers = request_headers - if use_proxy: proxy = self.proxy else: diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py index 4e31b7e..d59f4b4 100644 --- a/Robots/bkmk_rcurl.py +++ b/Robots/bkmk_rcurl.py @@ -13,23 +13,17 @@ __all__ = ['robot_curl'] from urllib.parse import urlsplit, parse_qsl, quote, quote_plus, urlencode -from m_lib.defenc import default_encoding import certifi import pycurl -from Robots.base import robot_base, request_headers +from Robots.base import robot_base class robot_curl(robot_base): def version_str(self): return str(pycurl.version) - async def get(self, url, accept_charset=None, use_proxy=False): - if accept_charset: - headers = request_headers.copy() - headers['Accept-Charset'] = accept_charset - else: - headers = request_headers + async def get(self, url, headers, use_proxy=False): headers = ['%s: %s' % (k, v) for k, v in headers.items()] curl = pycurl.Curl() @@ -63,7 +57,7 @@ class robot_curl(robot_base): try: url.encode('ascii') except UnicodeEncodeError: - url = encode_url(url, accept_charset) + url = encode_url(url) curl.setopt(pycurl.URL, url) try: curl.perform() @@ -102,10 +96,7 @@ class robot_curl(robot_base): return '' # We don't store welcome message yet -def encode_url(url, encoding): - if not encoding: - encoding = default_encoding - +def encode_url(url, encoding='latin1'): split_results = urlsplit(url) protocol, netloc, path, query, tag = split_results user = split_results.username @@ -116,21 +107,15 @@ def encode_url(url, encoding): if query: qlist = [] for name, value in parse_qsl(query): - if isinstance(name, bytes): - name = name.decode(default_encoding) - value = value.decode(default_encoding) - name = name.encode(encoding) - value = value.encode(encoding) + if not isinstance(name, bytes): + name = name.encode(encoding) + value = value.encode(encoding) qlist.append((name, value)) url = protocol + "://" if user: - if isinstance(user, bytes): - user = user.decode(default_encoding) url += quote(user.encode(encoding)) if password: - if isinstance(password, bytes): - password = password.decode(default_encoding) url += ':' + quote(password.encode(encoding)) url += '@' if host: @@ -143,14 +128,10 @@ def encode_url(url, encoding): if protocol == "file": url += quote(path) else: - if isinstance(path, bytes): - path = path.decode(default_encoding) url += quote(path.encode(encoding)) if query: url += '?' + urlencode(qlist) if tag: - if isinstance(tag, bytes): - tag = tag.decode(default_encoding) url += '#' + quote_plus(tag.encode(encoding)) return url diff --git a/Robots/bkmk_rmultiaio.py b/Robots/bkmk_rmultiaio.py index 62da680..d4b3342 100644 --- a/Robots/bkmk_rmultiaio.py +++ b/Robots/bkmk_rmultiaio.py @@ -50,12 +50,11 @@ class robot_multiaio(multi_mixin, robot_aiohttp): current_href.set(bookmark.href) await self.check_bookmark_async(bookmark) - async def get_url(self, url, accept_charset=None): + async def get_url(self, url, headers): if url not in self.logs: self.logs[url] = [] current_href.set(url) - return await super(robot_multiaio, self).get_url( - url, accept_charset=accept_charset) + return await super(robot_multiaio, self).get_url(url, headers) def wait(self): self.loop.run_until_complete(self.wait_async()) diff --git a/Robots/bkmk_rrequests.py b/Robots/bkmk_rrequests.py index dd5a120..c9d5f68 100644 --- a/Robots/bkmk_rrequests.py +++ b/Robots/bkmk_rrequests.py @@ -21,14 +21,14 @@ from requests.packages.urllib3.util.ssl_ import create_urllib3_context import requests import urllib3 -from Robots.base import robot_base, request_headers +from Robots.base import robot_base class robot_requests(robot_base): def version_str(self): return 'python-requests urllib3/%s' % urllib3.__version__ - async def get(self, url, accept_charset=None, use_proxy=False): + async def get(self, url, headers, use_proxy=False): if url.startswith('ftp://'): error, welcome, body = _get_ftp(url, self.timeout) if error is not None: @@ -36,12 +36,6 @@ class robot_requests(robot_base): self.welcome = welcome return None, None, None, None, body - if accept_charset: - headers = request_headers.copy() - headers['Accept-Charset'] = accept_charset - else: - headers = request_headers - if use_proxy: proxies = {'http': self.proxy, 'https': self.proxy} else: