git.phdru.name Git - bookmarks_db.git/commitdiff
Feat(Robots): Robot based on aiohttp (5.5.0)
author Oleg Broytman <phd@phdru.name>
Mon, 5 Aug 2024 12:00:55 +0000 (15:00 +0300)
committer Oleg Broytman <phd@phdru.name>
Tue, 6 Aug 2024 07:52:04 +0000 (10:52 +0300)
Robots/bkmk_raiohttp.py [new file with mode: 0644]
Robots/bkmk_robot_base.py
bkmk_db-venv
doc/ANNOUNCE
doc/ChangeLog
doc/TODO
robots.py
setup.py

diff --git a/Robots/bkmk_raiohttp.py b/Robots/bkmk_raiohttp.py
new file mode 100644 (file)
index 0000000..ed4dac6
--- /dev/null
+++ b/Robots/bkmk_raiohttp.py
@@ -0,0 +1,62 @@
+"""Robot based on aiohttp
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_aiohttp']
+
+
+import asyncio
+import aiohttp
+import aiohttp.client_exceptions
+from Robots.bkmk_robot_base import robot_base, request_headers
+
+
+class robot_aiohttp(robot_base):
+    def version_str(self):
+        return 'aiohttp/%s' % aiohttp.__version__
+
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
+        if accept_charset and bookmark.charset:
+            headers = request_headers.copy()
+            headers['Accept-Charset'] = bookmark.charset
+        else:
+            headers = request_headers
+
+        if use_proxy:
+            proxy = self.proxy
+        else:
+            proxy = None
+
+        error, status, resp_headers, body = asyncio.run(get(
+            url, headers=headers, proxy=proxy,
+            connect_timeout=self.connect_timeout, timeout=self.timeout,
+        ))
+        if error is not None or (status and status >= 400):
+            if error is None:
+                error = 'Error %d' % status
+            else:
+                error = str(error)
+                if status:
+                    error = 'Error %d %s' % (status, error)
+            return error, status, None, None, None
+        if status and status >= 300:
+            return None, status, resp_headers['Location'], None, None
+        return None, status, None, resp_headers, body
+
+
+async def get(url, headers={}, proxy=None, connect_timeout=30, timeout=60):
+    timeout = aiohttp.ClientTimeout(connect=connect_timeout, total=timeout)
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(
+                    url, headers=headers, proxy=proxy,
+                    allow_redirects=False) as resp:
+                return None, resp.status, resp.headers, await resp.read()
+    except (asyncio.TimeoutError, aiohttp.client_exceptions.ClientError) as e:
+        return e, None, None, None
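
For reference, the module-level get() coroutine above can be exercised on its
own. A minimal sketch, with a placeholder URL and timeouts; inside the robot it
is driven synchronously through asyncio.run(), as robot_aiohttp.get() shows:

    import asyncio
    from Robots.bkmk_raiohttp import get

    async def main():
        # get() never follows redirects (allow_redirects=False),
        # so 3xx responses come back with their Location header intact.
        error, status, headers, body = await get(
            'https://example.com/', connect_timeout=10, timeout=30)
        if error is not None:
            print('Network error:', error)
        elif 300 <= status < 400:
            print('Redirects to:', headers.get('Location'))
        else:
            print('HTTP %d, %d bytes' % (status, len(body)))

    asyncio.run(main())
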
diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py
index a7429411b8e46eb81518ac4f1ade76325a42b6f9..dffffdb09645361c80ff5a0adc666b3dd8b3fdeb 100644 (file)
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -105,7 +105,7 @@ class robot_base(Robot):
             error, http_status_code, redirect_to, headers, content = \
                 self.smart_get(bookmark, bookmark.href, True)
 
-            if error:
+            if error is not None:
                 bookmark.error = error
                 return 1
 
@@ -169,7 +169,8 @@ class robot_base(Robot):
                             is_html = True
                             break
                     content_stripped = content.strip()
-                    if content_stripped and charset:
+                    if content_stripped and charset \
+                            and isinstance(content_stripped, bytes):
                         try:
                             content_stripped = content_stripped.decode(
                                 charset, 'replace')
@@ -371,13 +372,15 @@ class robot_base(Robot):
                                  use_proxy=True)
                     if error is None:
                         self.proxy_ok.add(url_host)
-        if error is not None:
+        if (error is not None) or (
+            http_status_code and (http_status_code >= 400)
+        ):
             if use_proxy:
                 self.log('   Proxy error    : %s' % error)
                 if url_host not in self.proxy_ok:
                     self.proxy_error.add(url_host)
             return error, http_status_code, None, None, None
-        if http_status_code:
+        if http_status_code and (http_status_code >= 300):
             return None, http_status_code, redirect_to, None, None
         return None, None, None, headers, content
 
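Taken together, the hunks above tighten smart_get()'s three-way contract:
error is set on a network failure or an HTTP status >= 400, a bare status now
means a 3xx redirect (with redirect_to), and on success both are None and only
headers and content are filled. A minimal caller sketch; robot and bookmark
stand for a robot_base instance and a bookmark object as in the first hunk,
and handle_redirect/handle_content are placeholders:

    error, status, redirect_to, headers, content = robot.smart_get(
        bookmark, bookmark.href, True)
    if error is not None:    # network failure or HTTP status >= 400
        bookmark.error = error
    elif status:             # 3xx: redirect_to is the new location
        handle_redirect(status, redirect_to)
    else:                    # success: only headers and content are set
        handle_content(headers, content)
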
diff --git a/bkmk_db-venv b/bkmk_db-venv
index faed80bcdc1ebf038a19a51a09b0b4569355fe85..62d0d98e5916bc65a57217cc557f6bc78dd61c75 100644 (file)
--- a/bkmk_db-venv
+++ b/bkmk_db-venv
@@ -9,6 +9,6 @@ if [ -z "$VIRTUAL_ENV" ]; then
          . bkmk_db-venv/bin/activate &&
          pip install --compile --upgrade beautifulsoup4 lxml m_lib.full \
          requests requests-ftp \
-         certifi pycurl
+         pycurl certifi aiohttp
     }
 fi
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE
index aec114b8db3e63feb1f683f6cf9f886837fc43f2..3fb4356e567c76840212d8405dc23c981a499f5a 100644 (file)
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -7,6 +7,12 @@ bookmarks.html.
 
 WHAT'S NEW
 
+Version 5.5.0 (2024-08-06)
+
+   Robot based on aiohttp.
+
+   Default list of robots is now aiohttp,curl,requests,forking.
+
 Version 5.4.1 (2024-08-04)
 
    Fix(bkmk_rcurl): IDNA-encode URLs. PycURL doesn't encode URLs itself
diff --git a/doc/ChangeLog b/doc/ChangeLog
index da565471f2bf940d0d04e9a19bf15a5cbb4a2532..cb8415da7c1a6bae5c869b5a86b5e7871d7b0fc6 100644 (file)
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,9 @@
+Version 5.5.0 (2024-08-06)
+
+   Robot based on aiohttp.
+
+   Default list of robots is now aiohttp,curl,requests,forking.
+
 Version 5.4.1 (2024-08-04)
 
    Fix(bkmk_rcurl): IDNA-encode URLs. PycURL doesn't encode URLs itself
diff --git a/doc/TODO b/doc/TODO
index 23564fc7977dfecd2deffc7f7522286a3c265ecf..d2e6573ebaaf072f1b3356080b8d4f412446942f 100644 (file)
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,4 +1,4 @@
-Robot based on aiohttp.
+aioftp.
 
 Robot(s) that test many URLs in parallel.
 
diff --git a/robots.py b/robots.py
index 575f9bdf8c2cbe9319d4aae50350e1bb8c4495a8..06ce1fecac127b96fff6fd6fb4c3654df4da9973 100644 (file)
--- a/robots.py
+++ b/robots.py
@@ -16,7 +16,7 @@ from os import environ
 from bkmk_objects import parse_params, set_params
 
 robot_names, robot_params = parse_params(
-    environ.get("BKMK_ROBOT", "curl,requests,forking"))
+    environ.get("BKMK_ROBOT", "aiohttp,curl,requests,forking"))
 
 
 def import_robot(robot_name):
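
The robot list is read from the BKMK_ROBOT environment variable at import
time, so the new default can be overridden without touching the code. A
sketch, assuming import_robot() returns the robot class for a given name (its
body is not shown in this hunk):

    import os
    os.environ['BKMK_ROBOT'] = 'aiohttp'  # use only the new aiohttp robot

    from robots import import_robot  # parse_params() runs at import time
    robot_class = import_robot('aiohttp')
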
diff --git a/setup.py b/setup.py
index 27ef38b1d9690f8fdde8cb4c55c627733257dd9a..0a570ad02cea5184ca1d1c3b2a57850f6fff3cb7 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup
 
 setup(
     name='bookmarks_db',
-    version='5.4.1',
+    version='5.5.0',
     description='Bookmarks database and Internet robot',
     long_description=open('README', 'r').read(),
     long_description_content_type="text/plain",
@@ -41,5 +41,6 @@ setup(
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests', 'requests-ftp'],
         'curl': ['pycurl', 'certifi'],
+        'aiohttp:python_version>="3.4"': ['aiohttp'],
     },
 )
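
With the extra declared this way, pip install 'bookmarks_db[aiohttp]' pulls
in aiohttp on interpreters matching the python_version>="3.4" environment
marker; on older Pythons the extra resolves to nothing and the other robots
remain available.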