git.phdru.name Git - bookmarks_db.git/commitdiff
Refactor(Robots): Pass headers instead of charset
author    Oleg Broytman <phd@phdru.name>
Tue, 20 Aug 2024 22:21:26 +0000 (01:21 +0300)
committer Oleg Broytman <phd@phdru.name>
Sat, 7 Sep 2024 10:59:02 +0000 (13:59 +0300)
Robots/base.py
Robots/bkmk_raiohttp.py
Robots/bkmk_rcurl.py
Robots/bkmk_rmultiaio.py
Robots/bkmk_rrequests.py
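
The change moves charset negotiation out of the individual robots: instead
of passing bookmark.charset down and letting every get() implementation
build an Accept-Charset header, robot_base.check_bookmark() now builds the
headers dict once and threads it through get_url() to each robot's get().
In sketch form (signatures taken from the hunks below):

    # Before: every robot rebuilt headers from an optional charset.
    async def get(self, url, accept_charset=None, use_proxy=False): ...

    # After: the caller prepares the headers dict once; robots use it as-is.
    async def get(self, url, headers, use_proxy=False): ...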

diff --git a/Robots/base.py b/Robots/base.py
index d9551be25ed4dfb6785e97d39b75b8864f2c74d9..e3cf4614f0e4c5aa303f110830230f40f7928e2f 100644
--- a/Robots/base.py
+++ b/Robots/base.py
@@ -105,8 +105,14 @@ class robot_base(Robot):
             self.start = int(time.time())
             bookmark.icon = None
 
+            if bookmark.charset:
+                headers = request_headers.copy()
+                headers['Accept-Charset'] = bookmark.charset
+            else:
+                headers = request_headers
+
             error, http_status_code, redirect_to, headers, content = \
-                await self.get_url(bookmark.href, bookmark.charset)
+                await self.get_url(bookmark.href, headers)
 
             if error is not None:
                 bookmark.error = error
@@ -221,7 +227,8 @@ class robot_base(Robot):
                                     icon_error, \
                                         icon_status_code, icon_redirect_to, \
                                         icon_headers, icon_data = \
-                                        await self.get_url(_icon_url)
+                                        await self.get_url(
+                                            _icon_url, request_headers)
                                     if icon_error:
                                         raise IOError("No icon: " + icon_error)
                                         break
@@ -345,7 +352,7 @@ class robot_base(Robot):
         finally:
             self.finish_check_url(bookmark)
 
-    async def get_url(self, url, accept_charset=None):
+    async def get_url(self, url, headers):
         split_results = urlsplit(url)
         url_proto = split_results.scheme
         url_host = split_results.hostname
@@ -367,11 +374,10 @@ class robot_base(Robot):
         if use_proxy and url_host in self.proxy_ok:
             self.log('   Immediately trying with the proxy')
             error, http_status_code, redirect_to, headers, content = \
-                await self.get(url, accept_charset=accept_charset,
-                               use_proxy=True)
+                await self.get(url, headers, use_proxy=True)
         else:
             error, http_status_code, redirect_to, headers, content = \
-                await self.get(url, accept_charset=accept_charset)
+                await self.get(url, headers)
             if error is not None and (
                 not url_host.startswith('localhost') and
                 not url_host.startswith('127.')
@@ -380,8 +386,7 @@ class robot_base(Robot):
                 if use_proxy and http_status_code != 404:
                     self.log('   Retrying with the proxy...')
                     error, http_status_code, redirect_to, headers, content = \
-                        await self.get(url, accept_charset=accept_charset,
-                                       use_proxy=True)
+                        await self.get(url, headers, use_proxy=True)
                     if error is None:
                         self.proxy_ok.add(url_host)
         if (error is not None) or (
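
For a robot subclass the contract after this commit is, roughly, the sketch
below; robot_example is hypothetical, but the signature and the 5-tuple
return shape (error, http_status_code, redirect_to, headers, content) come
from the hunks above:

    from Robots.base import robot_base

    class robot_example(robot_base):
        async def get(self, url, headers, use_proxy=False):
            # Perform the request with the ready-made headers dict;
            # no Accept-Charset handling is needed here any more.
            ...
            return None, 200, None, {}, b''  # error, status, redirect, headers, body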
diff --git a/Robots/bkmk_raiohttp.py b/Robots/bkmk_raiohttp.py
index 30294c0dfd3a71241b1b6b4bc4310798159123f7..fa593608d519de92fb5688be8588e35cd90b999c 100644
--- a/Robots/bkmk_raiohttp.py
+++ b/Robots/bkmk_raiohttp.py
@@ -22,14 +22,14 @@ import aioftp
 import aiohttp
 import aiohttp.client_exceptions
 
-from Robots.base import robot_base, request_headers
+from Robots.base import robot_base
 
 
 class robot_aiohttp(robot_base):
     def version_str(self):
         return 'aiohttp/%s' % aiohttp.__version__
 
-    async def get(self, url, accept_charset=None, use_proxy=False):
+    async def get(self, url, headers, use_proxy=False):
         if url.startswith('ftp://'):
             error, body = await _get_ftp(
                 url, timeout=self.ftp_timeout,
@@ -39,12 +39,6 @@ class robot_aiohttp(robot_base):
                 return error, None, None, None, None
             return None, None, None, None, body
 
-        if accept_charset:
-            headers = request_headers.copy()
-            headers['Accept-Charset'] = accept_charset
-        else:
-            headers = request_headers
-
         if use_proxy:
             proxy = self.proxy
         else:
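
With the header building hoisted into the base class, robot_aiohttp can
hand the dict straight to aiohttp, which accepts a headers mapping
directly. A simplified sketch (the real robot's timeout, FTP and proxy
wiring are omitted):

    import aiohttp

    async def fetch(url, headers, proxy=None):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, proxy=proxy) as resp:
                return resp.status, await resp.read()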
diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
index 4e31b7e2ec6debc63961675590f31e914d143813..d59f4b4cbb3b9cf7c83400468ea4d80ff8aa4bb1 100644
--- a/Robots/bkmk_rcurl.py
+++ b/Robots/bkmk_rcurl.py
@@ -13,23 +13,17 @@ __all__ = ['robot_curl']
 
 from urllib.parse import urlsplit, parse_qsl, quote, quote_plus, urlencode
 
-from m_lib.defenc import default_encoding
 import certifi
 import pycurl
 
-from Robots.base import robot_base, request_headers
+from Robots.base import robot_base
 
 
 class robot_curl(robot_base):
     def version_str(self):
         return str(pycurl.version)
 
-    async def get(self, url, accept_charset=None, use_proxy=False):
-        if accept_charset:
-            headers = request_headers.copy()
-            headers['Accept-Charset'] = accept_charset
-        else:
-            headers = request_headers
+    async def get(self, url, headers, use_proxy=False):
         headers = ['%s: %s' % (k, v) for k, v in headers.items()]
 
         curl = pycurl.Curl()
@@ -63,7 +57,7 @@ class robot_curl(robot_base):
         try:
             url.encode('ascii')
         except UnicodeEncodeError:
-            url = encode_url(url, accept_charset)
+            url = encode_url(url)
         curl.setopt(pycurl.URL, url)
         try:
             curl.perform()
@@ -102,10 +96,7 @@ class robot_curl(robot_base):
         return ''  # We don't store welcome message yet
 
 
-def encode_url(url, encoding):
-    if not encoding:
-        encoding = default_encoding
-
+def encode_url(url, encoding='latin1'):
     split_results = urlsplit(url)
     protocol, netloc, path, query, tag = split_results
     user = split_results.username
@@ -116,21 +107,15 @@ def encode_url(url, encoding):
     if query:
         qlist = []
         for name, value in parse_qsl(query):
-            if isinstance(name, bytes):
-                name = name.decode(default_encoding)
-                value = value.decode(default_encoding)
-            name = name.encode(encoding)
-            value = value.encode(encoding)
+            if not isinstance(name, bytes):
+                name = name.encode(encoding)
+                value = value.encode(encoding)
             qlist.append((name, value))
 
     url = protocol + "://"
     if user:
-        if isinstance(user, bytes):
-            user = user.decode(default_encoding)
         url += quote(user.encode(encoding))
         if password:
-            if isinstance(password, bytes):
-                password = password.decode(default_encoding)
             url += ':' + quote(password.encode(encoding))
         url += '@'
     if host:
@@ -143,14 +128,10 @@ def encode_url(url, encoding):
         if protocol == "file":
             url += quote(path)
         else:
-            if isinstance(path, bytes):
-                path = path.decode(default_encoding)
             url += quote(path.encode(encoding))
     if query:
         url += '?' + urlencode(qlist)
     if tag:
-        if isinstance(tag, bytes):
-            tag = tag.decode(default_encoding)
         url += '#' + quote_plus(tag.encode(encoding))
 
     return url
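
Note the behaviour change in encode_url(): it now assumes str input (the
bytes branches are gone) and falls back to latin1 instead of m_lib's
default_encoding. A usage sketch with a made-up URL:

    from Robots.bkmk_rcurl import encode_url

    # Non-ASCII characters are percent-encoded as latin1 bytes by default:
    print(encode_url('https://example.com/päth?q=wört'))
    # -> https://example.com/p%E4th?q=w%F6rt
    #    (approximately; the host-encoding lines fall outside the hunks shown)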
diff --git a/Robots/bkmk_rmultiaio.py b/Robots/bkmk_rmultiaio.py
index 62da6807c59770bbd48b27f0764c9d36149f9a75..d4b3342d2c6ce579f9ada8452cbc018a0adea1f7 100644
--- a/Robots/bkmk_rmultiaio.py
+++ b/Robots/bkmk_rmultiaio.py
@@ -50,12 +50,11 @@ class robot_multiaio(multi_mixin, robot_aiohttp):
         current_href.set(bookmark.href)
         await self.check_bookmark_async(bookmark)
 
-    async def get_url(self, url, accept_charset=None):
+    async def get_url(self, url, headers):
         if url not in self.logs:
             self.logs[url] = []
         current_href.set(url)
-        return await super(robot_multiaio, self).get_url(
-            url, accept_charset=accept_charset)
+        return await super(robot_multiaio, self).get_url(url, headers)
 
     def wait(self):
         self.loop.run_until_complete(self.wait_async())
diff --git a/Robots/bkmk_rrequests.py b/Robots/bkmk_rrequests.py
index dd5a120b9ec1a33b7772f644aadda01b3dd04850..c9d5f68c002b87ae5cbc9c31faf3eca479671972 100644
--- a/Robots/bkmk_rrequests.py
+++ b/Robots/bkmk_rrequests.py
@@ -21,14 +21,14 @@ from requests.packages.urllib3.util.ssl_ import create_urllib3_context
 import requests
 import urllib3
 
-from Robots.base import robot_base, request_headers
+from Robots.base import robot_base
 
 
 class robot_requests(robot_base):
     def version_str(self):
         return 'python-requests urllib3/%s' % urllib3.__version__
 
-    async def get(self, url, accept_charset=None, use_proxy=False):
+    async def get(self, url, headers, use_proxy=False):
         if url.startswith('ftp://'):
             error, welcome, body = _get_ftp(url, self.timeout)
             if error is not None:
@@ -36,12 +36,6 @@ class robot_requests(robot_base):
             self.welcome = welcome
             return None, None, None, None, body
 
-        if accept_charset:
-            headers = request_headers.copy()
-            headers['Accept-Charset'] = accept_charset
-        else:
-            headers = request_headers
-
         if use_proxy:
             proxies = {'http': self.proxy, 'https': self.proxy}
         else:
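
robot_requests follows the same pattern: the prepared dict is forwarded
verbatim to requests, which takes a headers mapping as a keyword argument.
A minimal sketch (the timeout value is illustrative):

    import requests

    def fetch(url, headers, proxies=None):
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=60)
        return resp.status_code, resp.content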