git.phdru.name Git - bookmarks_db.git/commitdiff
Feat(Robots): Robot based on aiohttp and concurrent.futures
author    Oleg Broytman <phd@phdru.name>
Tue, 10 Sep 2024 14:17:36 +0000 (17:17 +0300)
committer Oleg Broytman <phd@phdru.name>
Tue, 10 Sep 2024 14:52:22 +0000 (17:52 +0300)
Processes multiple URLs in parallel (multithreaded).
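To try it, assuming the usual robot selection via the BKMK_ROBOT
environment variable and a robot name derived from the module name:

    export BKMK_ROBOT=multiaio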

Robots/bkmk_rmultiaio.py [new file with mode: 0644]
doc/ANNOUNCE
doc/ChangeLog

diff --git a/Robots/bkmk_rmultiaio.py b/Robots/bkmk_rmultiaio.py
new file mode 100644 (file)
index 0000000..78cdaa2
--- /dev/null
+++ b/Robots/bkmk_rmultiaio.py
@@ -0,0 +1,90 @@
+"""Robot based on aiohttp and concurrent.futures,
+processes multiple URLs in parallel (multithreaded).
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_multiaio']
+
+
+import asyncio
+import aiohttp
+
+from Robots.bkmk_raio import _get_http, _get_ftp
+from Robots.concurrent_futures import cf_multithread
+
+
+class robot_multiaio(cf_multithread):
+    def __init__(self, *args, **kw):
+        self.async_pending = set()  # pending async tasks
+        cf_multithread.__init__(self, *args, **kw)
+
+    def version_str(self):
+        return 'aiohttp/%s; multithreaded' % aiohttp.__version__
+
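+    # Runs all network I/O in a single asyncio event loop;
+    # cf_multithread presumably starts this method in its own thread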
+    def main_thread(self):
+        asyncio.run(self.main_thread_async())
+
+    async def main_thread_async(self):
+        """Main loop"""
+
+        while True:
+            if self.queue.empty():
+                if not self.async_pending:
+                    # Nothing queued and nothing in flight: yield
+                    # to the event loop instead of busy-waiting
+                    await asyncio.sleep(0.1)
+            else:
+                request = self.queue.get_nowait()
+                if request is None:  # Signal to stop
+                    return
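+                # A request is a tuple:
+                # (url, request headers, proxy flag, reply queue)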
+                url, req_headers, use_proxy, queue = request
+
+                task = asyncio.create_task(
+                    self.get_url_task(url, req_headers, use_proxy, queue))
+                self.async_pending.add(task)
+
+            if self.async_pending:
+                done, async_pending = await asyncio.wait(
+                    self.async_pending, timeout=self.timeout,
+                    return_when=asyncio.FIRST_COMPLETED)
+                self.async_pending = async_pending
+
+                for task in done:
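+                    # Each task returns its reply queue as the last
+                    # element so the reply reaches the thread that
+                    # submitted the URL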
+                    error, status, resp_headers, body, queue = task.result()
+                    queue.put_nowait((error, status, resp_headers, body))
+
+    async def get_url_task(self, url, req_headers, use_proxy, queue):
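+        # FTP gives no HTTP status or headers: report only
+        # the body or the error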
+        if url.startswith('ftp://'):
+            error, body = await _get_ftp(
+                url, timeout=self.timeout,
+            )
+            if error is not None:
+                error = str(error)
+                return error, None, None, None, queue
+            return None, None, None, body, queue
+
+        if use_proxy:
+            proxy = self.proxy
+        else:
+            proxy = None
+
+        error, status, resp_headers, body = await _get_http(
+            url, req_headers, proxy=proxy,
+            timeout=self.timeout,
+        )
+        return error, status, resp_headers, body, queue
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE
index 7239608b2de3616f3ca02ab52f460cbf7ec3cd19..9bced52c63b5fdc62efb4eff18e1abdc41cee5af 100644 (file)
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -9,14 +9,18 @@ WHAT'S NEW
 
 Version 6.2.0 (2024-??-??)
 
+   Robot based on aiohttp and concurrent.futures,
+   processes multiple URLs in parallel (multithreaded).
+   Works slowly, the same way as the curl-based robot.
+
+   Default list of robots is multirequests,aio.
+
    Robot based on twisted and concurrent.futures,
    processes multiple URLs in parallel (multithreaded).
    Doesn't properly support proxies; has problems with HTTP proxy
    and doesn't support SOCKS5 proxy at all.
    Doesn't query FTP; requires more work.
 
-   Default list of robots is still multirequests,aio.
-
    Robots: Removed ftp_timeout.
 
 
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 50750a4dffbd4a4d76b6a378c27383f58b4a975e..e0e4589f10f3175778b460a7883b586ee3370e21 100644 (file)
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,13 +1,17 @@
 Version 6.2.0 (2024-??-??)
 
+   Robot based on aiohttp and concurrent.futures,
+   processes multiple URLs in parallel (multithreaded).
+   Works slowly, the same way as the curl-based robot.
+
+   Default list of robots is multirequests,aio.
+
    Robot based on twisted and concurrent.futures,
    processes multiple URLs in parallel (multithreaded).
    Doesn't properly support proxies; has problems with HTTP proxy
    and doesn't support SOCKS5 proxy at all.
    Doesn't query FTP; requires more work.
 
-   Default list of robots is still multirequests,aio.
-
    Robots: Removed ftp_timeout.
 
 Version 6.1.0 (2024-09-08)