git.phdru.name Git - bookmarks_db.git/commitdiff
Feat(Robots): Combined curl with curlmulti
author    Oleg Broytman <phd@phdru.name>
Sun, 8 Sep 2024 07:36:12 +0000 (10:36 +0300)
committer Oleg Broytman <phd@phdru.name>
Sun, 8 Sep 2024 07:56:44 +0000 (10:56 +0300)
The combined robot is named just curl.

Robots/bkmk_rcurl.py
Robots/bkmk_rcurlmulti.py [deleted file]
doc/ANNOUNCE
doc/ChangeLog
robots.py

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
index 73bfbe32eddb79babd91ed65485de8fd41565f15..36c715b71add292b1255410c65281483174920a7 100644
--- a/Robots/bkmk_rcurl.py
+++ b/Robots/bkmk_rcurl.py
@@ -1,4 +1,5 @@
-"""Robot based on PycURL
+"""Robot based on curl_multi and concurrent.futures,
+processes multiple URLs in parallel (multithreaded).
 
 This file is a part of Bookmarks database and Internet robot.
 
@@ -11,32 +12,101 @@ __license__ = "GNU GPL"
 __all__ = ['robot_curl']
 
 
+from queue import Queue
+from time import sleep
 from urllib.parse import urlsplit, parse_qsl, quote, quote_plus, urlencode
+import concurrent.futures
+import threading
 
 import certifi
 import pycurl
 
 from Robots.base import robot_base
+from Robots.concurrent_futures import concurrent_futures
 
 
-class robot_curl(robot_base):
+class robot_curl(concurrent_futures):
+    def __init__(self, *args, **kw):
+        self.concurrent_class = concurrent.futures.ThreadPoolExecutor
+        concurrent_futures.__init__(self, *args, **kw)
+        #
+        # Rename self.log, create one log_lines list per URL (thread)
+        self.file_log = self.log
+        del self.log
+        self.local = threading.local()
+
+        # Queue to pass requests to the main loop
+        self.queue = Queue()
+
+        # Main loop: run curl_multi
+        self.executor.submit(self.curl_multi)
+
+    def __getattr__(self, attr):
+        if attr != 'log':
+            raise AttributeError(attr)
+        try:
+            return self.local.log.append
+        except AttributeError:
+            return self.file_log
+
     def version_str(self):
         return str(pycurl.version)
 
+    def check_bkmk_task(self):
+        return self.check_bkmk_thread
+
+    def check_bkmk_thread(self, bookmark):
+        self.local.log = log = []
+        robot_base.check_bookmark(self, bookmark)
+        return bookmark, log
+
     async def get(self, url, req_headers, use_proxy=False):
-        curlw = CurlWrapper(self, url, req_headers, use_proxy)
+        # Queue to wait results
+        queue = Queue(maxsize=1)
+        self.queue.put((url, req_headers, use_proxy, queue))
+        return queue.get()
 
-        try:
-            curlw.perform()
-        except pycurl.error as e:
-            error = str(e)
-            status = None
-        else:
-            error = None
-            status = curlw.getinfo(pycurl.HTTP_CODE)
-        curlw.close()
+    def curl_multi(self):
+        _curl_multi = pycurl.CurlMulti()
+        _curl_multi.wrappers = {}
 
-        return error, status, curlw.resp_headers, curlw.body
+        while True:
+            if self.queue.empty():
+                pass
+            else:
+                request = self.queue.get_nowait()
+                if request is None:  # Signal to stop
+                    return
+                url, req_headers, use_proxy, queue = request
+                cw = CurlWrapper(self, url, req_headers, use_proxy)
+                _curl_multi.wrappers[cw.curl] = (cw, queue)
+                _curl_multi.add_handle(cw.curl)
+
+            _curl_multi.select(1.0)
+            while True:
+                ret, num_handles = _curl_multi.perform()
+                _, completed, failed = _curl_multi.info_read()
+                for c in completed:
+                    _curl_multi.remove_handle(c)
+                    cw, queue = _curl_multi.wrappers.pop(c)
+                    queue.put_nowait(
+                        (None, cw.getinfo(pycurl.HTTP_CODE),
+                         cw.resp_headers, cw.body)
+                    )
+                for c, errcode, errmsg in failed:
+                    _curl_multi.remove_handle(c)
+                    cw, queue = _curl_multi.wrappers.pop(c)
+                    queue.put_nowait(('Error: %d %s' % (errcode, errmsg),
+                                      None, None, None))
+                if ret != pycurl.E_CALL_MULTI_PERFORM:
+                    break
+            if num_handles == 0:
+                sleep(1)
+
+    def wait(self):
+        super(robot_curl, self).wait()
+        if not self.bookmarks and not self.pending:
+            self.queue.put(None)  # Signal to stop
 
     def get_ftp_welcome(self):
         return ''  # We don't store welcome message yet
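
The heart of the combined robot is visible in the hunks above: each worker
thread's get() posts its request together with a one-slot reply queue onto a
shared Queue, and a single dispatcher thread drives pycurl.CurlMulti and posts
results back. A minimal, self-contained sketch of that rendezvous pattern
(simplified and without pycurl; the dispatcher/reply names are illustrative,
not from the repository):

    from queue import Queue
    from threading import Thread

    requests = Queue()  # shared queue: worker threads -> dispatcher

    def dispatcher():
        # Single thread that owns the transfer machinery; in
        # Robots/bkmk_rcurl.py this loop drives pycurl.CurlMulti.
        while True:
            request = requests.get()
            if request is None:  # signal to stop, as in robot_curl.wait()
                return
            url, reply = request
            # Simulate a finished transfer: (error, status, headers, body)
            reply.put_nowait((None, 200, {}, b'fake body for ' + url.encode()))

    def get(url):
        # Each caller blocks on its own one-slot reply queue,
        # mirroring robot_curl.get() above.
        reply = Queue(maxsize=1)
        requests.put((url, reply))
        return reply.get()

    Thread(target=dispatcher).start()
    print(get('http://example.com'))
    requests.put(None)  # stop the dispatcher

Keeping every libcurl handle on one thread avoids sharing easy handles across
threads while still letting many bookmark checks run concurrently.
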
diff --git a/Robots/bkmk_rcurlmulti.py b/Robots/bkmk_rcurlmulti.py
deleted file mode 100644
index 1d4f053..0000000
--- a/Robots/bkmk_rcurlmulti.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""Robot based on curl_multi and concurrent.futures,
-processes multiple URLs in parallel (multithreaded).
-
-This file is a part of Bookmarks database and Internet robot.
-
-"""
-
-__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
-__license__ = "GNU GPL"
-
-__all__ = ['robot_curlmulti']
-
-
-from queue import Queue, Empty as QueueEmpty
-from time import sleep
-import concurrent.futures
-import threading
-
-import pycurl
-
-from Robots.base import robot_base
-from Robots.concurrent_futures import concurrent_futures
-from Robots.bkmk_rcurl import CurlWrapper
-
-
-class robot_curlmulti(concurrent_futures):
-    def __init__(self, *args, **kw):
-        self.concurrent_class = concurrent.futures.ThreadPoolExecutor
-        concurrent_futures.__init__(self, *args, **kw)
-        #
-        # Rename self.log, create one log_lines list per URL (thread)
-        self.file_log = self.log
-        del self.log
-        self.local = threading.local()
-
-        # Queue to pass requests to the main loop
-        self.queue = Queue()
-
-        # Main loop: run curl_multi
-        self.executor.submit(self.curl_multi)
-
-    def __getattr__(self, attr):
-        if attr != 'log':
-            raise AttributeError(attr)
-        try:
-            return self.local.log.append
-        except AttributeError:
-            return self.file_log
-
-    def check_bkmk_task(self):
-        return self.check_bkmk_thread
-
-    def check_bkmk_thread(self, bookmark):
-        self.local.log = log = []
-        robot_base.check_bookmark(self, bookmark)
-        return bookmark, log
-
-    async def get(self, url, req_headers, use_proxy=False):
-        # Queue to wait results
-        queue = Queue(maxsize=1)
-        self.queue.put((url, req_headers, use_proxy, queue))
-        return queue.get()
-
-    def curl_multi(self):
-        _curl_multi = pycurl.CurlMulti()
-        _curl_multi.wrappers = {}
-
-        while True:
-            if self.queue.empty():
-                pass
-            else:
-                request = self.queue.get_nowait()
-                if request is None:  # Signal to stop
-                    return
-                url, req_headers, use_proxy, queue = request
-                cw = CurlWrapper(self, url, req_headers, use_proxy)
-                _curl_multi.wrappers[cw.curl] = (cw, queue)
-                _curl_multi.add_handle(cw.curl)
-
-            _curl_multi.select(1.0)
-            while True:
-                ret, num_handles = _curl_multi.perform()
-                _, completed, failed = _curl_multi.info_read()
-                for c in completed:
-                    _curl_multi.remove_handle(c)
-                    cw, queue = _curl_multi.wrappers.pop(c)
-                    queue.put_nowait(
-                        (None, cw.getinfo(pycurl.HTTP_CODE),
-                         cw.resp_headers, cw.body)
-                    )
-                for c, errcode, errmsg in failed:
-                    _curl_multi.remove_handle(c)
-                    cw, queue = _curl_multi.wrappers.pop(c)
-                    queue.put_nowait(('Error: %d %s' % (errcode, errmsg),
-                                      None, None, None))
-                if ret != pycurl.E_CALL_MULTI_PERFORM:
-                    break
-            if num_handles == 0:
-                sleep(1)
-
-    def wait(self):
-        super(robot_curlmulti, self).wait()
-        if not self.bookmarks and not self.pending:
-            self.queue.put(None)  # Signal to stop
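
One subtle idiom carried over verbatim from the deleted bkmk_rcurlmulti.py is
the per-thread logging: __init__ stashes the original file logger, and
__getattr__ resolves self.log to an append on the current thread's
threading.local() list, falling back to the file logger in threads that never
created one. A minimal sketch of the same idiom (class and variable names are
shortened and illustrative, not from the repository):

    import threading

    class PerThreadLog:
        def __init__(self, file_log):
            self.file_log = file_log        # shared fallback, e.g. a file writer
            self.local = threading.local()  # holds one .log list per thread

        def __getattr__(self, attr):
            # Only 'log' is resolved dynamically, as in robot_curl.__getattr__
            if attr != 'log':
                raise AttributeError(attr)
            try:
                return self.local.log.append  # collect into this thread's list
            except AttributeError:
                return self.file_log          # no thread-local list: log directly

    logger = PerThreadLog(print)
    logger.local.log = []       # give the current thread its own list
    logger.log('checked URL')   # appended to the thread-local list
    print(logger.local.log)     # ['checked URL']
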
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE
index 49a1e525db646850710487835c98c5bce8134fb6..3962e5aaeabdb0a1eca027aeeaf2b43959cb3a6c 100644
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -9,12 +9,14 @@ WHAT'S NEW
 
 Version 6.1.0 (2024-??-??)
 
-   Combine aiohttp with multiaio; the combined robot is named just aio.
+   Combined aiohttp with multiaio; the combined robot is named just aio.
 
    Robot based on curl_multi, processes multiple URLs in parallel
    using concurrent.futures (multithreaded).
 
-   Default list of robots is now multirequests,curlmulti,aio.
+   Combined curl with curlmulti; the combined robot is named just curl.
+
+   Default list of robots is now multirequests,aio,curl.
 
    Make bkmk_rmultirequests always multiprocess.
 
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 36c1968c6e3db59c5c2c7a019864c232a6c95b1c..54fa80233f71da2a601ade32060c21f5163bde75 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -5,7 +5,9 @@ Version 6.1.0 (2024-??-??)
    Robot based on curl_multi, processes multiple URLs in parallel
    using concurrent.futures (multithreaded).
 
-   Default list of robots is now multirequests,curlmulti,aio.
+   Combined curl with curlmulti; the combined robot is named just curl.
+
+   Default list of robots is now multirequests,aio,curl.
 
    Make bkmk_rmultirequests always multiprocess.
 
diff --git a/robots.py b/robots.py
index 23fce127229d12bd26dc73846c998206795361d2..7b79976e96786ad5e0be1e927d947b6996e2b6d3 100644
--- a/robots.py
+++ b/robots.py
@@ -16,7 +16,7 @@ from os import environ
 from bkmk_objects import parse_params, set_params
 
 robot_names, robot_params = parse_params(
-    environ.get("BKMK_ROBOT", "multirequests,curlmulti,aio"))
+    environ.get("BKMK_ROBOT", "multirequests,aio,curl"))
 
 
 def import_robot(robot_name):
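
With the rename, the combined robot is selected like any other robot through
the BKMK_ROBOT environment variable parsed above, for example (shell syntax;
the script name is illustrative, not part of this diff):

    BKMK_ROBOT=curl ./check_urls.py

When the variable is unset, robots.py falls back to the new default chain
multirequests,aio,curl.
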