From 8fadfc8d9a1538e71d2ca8a98bde5cc44ce2eda5 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Sun, 18 Aug 2024 23:38:52 +0300 Subject: [PATCH] Refactor(Robots): Split off `multi_mixin` --- Robots/bkmk_rmultirequests.py | 32 +++++++++-------------------- Robots/multi_mixin.py | 38 +++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 23 deletions(-) create mode 100644 Robots/multi_mixin.py diff --git a/Robots/bkmk_rmultirequests.py b/Robots/bkmk_rmultirequests.py index 5ae0e37..624263a 100644 --- a/Robots/bkmk_rmultirequests.py +++ b/Robots/bkmk_rmultirequests.py @@ -17,44 +17,31 @@ import os from bkmk_objects import copy_bkmk from Robots.bkmk_rrequests import robot_requests +from Robots.multi_mixin import multi_mixin from robots import import_robot, set_params, robot_params + cpu_count = os.cpu_count() -class robot_multirequests(robot_requests): +class robot_multirequests(multi_mixin, robot_requests): concurrent_class = concurrent.futures.ProcessPoolExecutor # or ThreadPoolExecutor # noqa: E501 line too long + # We're I/O-bound, not CPU-bound max_urls = 2*cpu_count if cpu_count else 10 def __init__(self, *args, **kw): - if isinstance(self.max_urls, str): - self.max_urls = int(self.max_urls) concurrent_class = getattr(concurrent.futures, self.concurrent_class) \ if isinstance(self.concurrent_class, str) \ else self.concurrent_class self.concurrent_class_name = concurrent_class.__name__ + multi_mixin.__init__(self, *args, **kw) robot_requests.__init__(self, *args, **kw) - self.executor = concurrent_class(max_urls=self.max_urls) - - # Bookmarks waiting to be processed; - # maps {URL: [bookmark, saved parent, future]} - self.bookmarks = {} - self.pending = set() # pending futures + self.executor = concurrent_class(max_workers=self.max_urls) def version_str(self): return super(robot_multirequests, self).version_str() \ - + ' concurrent.futures.' 
+ self.concurrent_class_name - - def check_bookmark(self, bookmark): - href = bookmark.href - bookmarks = self.bookmarks - if href in bookmarks: - return - bookmarks[href] = [bookmark, None, None] - if len(bookmarks) < self.max_urls: - return - self.wait() + + ' multi: concurrent.futures.' + self.concurrent_class_name def wait(self): log = self.log @@ -62,7 +49,7 @@ class robot_multirequests(robot_requests): pending = self.pending free_workers = self.max_urls - len(pending) - if bookmarks and (free_workers > 0): # there's job and free workers, + if bookmarks and (free_workers > 0): # we have job and free workers for href in bookmarks: bookmark, parent, ft = bookmarks[href] if ft is not None: # it's already pending @@ -96,8 +83,7 @@ class robot_multirequests(robot_requests): log(line) def stop(self): - while self.bookmarks or self.pending: - self.wait() + super(robot_multirequests, self).stop() self.executor.shutdown(wait=True) diff --git a/Robots/multi_mixin.py b/Robots/multi_mixin.py new file mode 100644 index 0000000..ef4542c --- /dev/null +++ b/Robots/multi_mixin.py @@ -0,0 +1,38 @@ +"""Mix-in for robots that process multiple URLs in parallel. + +This file is a part of Bookmarks database and Internet robot. 
+ +""" + +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 2024 PhiloSoft Design" +__license__ = "GNU GPL" + +__all__ = ['multi_mixin'] + + +class multi_mixin: + max_urls = 10 # Small default + + def __init__(self, *args, **kw): + if isinstance(self.max_urls, str): + self.max_urls = int(self.max_urls) + + # Bookmarks waiting to be processed; + # maps {URL: [bookmark, saved parent, task]} + self.bookmarks = {} + self.pending = set() # pending tasks + + def check_bookmark(self, bookmark): + href = bookmark.href + bookmarks = self.bookmarks + if href in bookmarks: + return + bookmarks[href] = [bookmark, None, None] + if len(bookmarks) < self.max_urls: + return + self.wait() + + def stop(self): + while self.bookmarks or self.pending: + self.wait() -- 2.39.5