From ee0d4bd5e284681f1109e57b3b19b9b1150f2ff1 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 19 Aug 2024 03:59:24 +0300 Subject: [PATCH] Feat(Robots): Robot based on aiohttp, processes multiple URLs in parallel --- Robots/bkmk_rmultiaio.py | 100 +++++++++++++++++++++++++++++++++++++++ bkmk_objects.py | 2 +- doc/ANNOUNCE | 4 ++ doc/ChangeLog | 4 ++ robots.py | 2 +- 5 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 Robots/bkmk_rmultiaio.py diff --git a/Robots/bkmk_rmultiaio.py b/Robots/bkmk_rmultiaio.py new file mode 100644 index 0000000..784752c --- /dev/null +++ b/Robots/bkmk_rmultiaio.py @@ -0,0 +1,100 @@ +"""Robot based on aiohttp, processes multiple URLs in parallel + +This file is a part of Bookmarks database and Internet robot. + +""" + +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 2024 PhiloSoft Design" +__license__ = "GNU GPL" + +__all__ = ['robot_multiaio'] + + +import asyncio +import contextvars + +from Robots.bkmk_raiohttp import robot_aiohttp +from Robots.multi_mixin import multi_mixin + + +current_href = contextvars.ContextVar('current_href') + + +class robot_multiaio(multi_mixin, robot_aiohttp): + def __init__(self, *args, **kw): + multi_mixin.__init__(self, *args, **kw) + robot_aiohttp.__init__(self, *args, **kw) + + # We need one event loop for the entire application + # so that we can save pending tasks between calls to self.wait(). + # This also means we cannot use asyncio.run(). 
+ self.loop = loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Rename self.log, create one log_lines list per URL + self.file_log = self.log + del self.log + self.logs = {} # Map {href: [log lines]} + + def __getattr__(self, attr): + if attr != 'log': + raise AttributeError(attr) + href = current_href.get() + return self.logs[href].append + + def version_str(self): + return super(robot_multiaio, self).version_str() + ' multi: aiohttp' + + async def check_bookmark_async_log(self, bookmark): + current_href.set(bookmark.href) + await self.check_bookmark_async(bookmark) + + def wait(self): + self.loop.run_until_complete(self.wait_async()) + + async def wait_async(self): + bookmarks = self.bookmarks + pending = self.pending + + free_workers = self.max_urls - len(pending) + if bookmarks and (free_workers > 0): # we have job and free workers + for href in bookmarks: + bookmark, _, task = bookmarks[href] + if task is not None: # it's already pending + continue + task = asyncio.create_task( + self.check_bookmark_async_log(bookmark)) + bookmarks[href] = [bookmark, None, task] + self.logs[href] = [] + pending.add(task) + task.href = href + + free_workers -= 1 + if free_workers == 0: + break + + if pending: + done, pending = await asyncio.wait( + pending, timeout=self.timeout+1, + return_when=asyncio.FIRST_COMPLETED) + self.pending = pending + + for task in done: + bookmark, _, old_task = bookmarks.pop(task.href) + assert old_task is task + log = self.file_log + log_lines = self.logs.pop(bookmark.href) + log('Checked: %s' % bookmark.href) + if log_lines: + for line in log_lines: + log(line) + else: + if hasattr(bookmark, 'error'): + log(' Error: %s' % bookmark.error) + else: + log(' No logs') + + def stop(self): + super(robot_multiaio, self).stop() + self.loop.close() diff --git a/bkmk_objects.py b/bkmk_objects.py index 3b604e1..02fab56 100644 --- a/bkmk_objects.py +++ b/bkmk_objects.py @@ -7,7 +7,7 @@ This file is a part of Bookmarks database and Internet 
robot. __author__ = "Oleg Broytman " __copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design" __license__ = "GNU GPL" -__version__ = '5.7.0' +__version__ = '6.0.0' __all__ = ['Folder', 'Bookmark', 'Ruler', 'Walker', 'Writer', 'Robot', 'InverseLinker', 'Linear', 'make_linear', 'make_tree', 'break_tree', diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE index 47b45a6..c6981a1 100644 --- a/doc/ANNOUNCE +++ b/doc/ANNOUNCE @@ -9,6 +9,10 @@ WHAT'S NEW Version 6.0.0 (2024-??-??) + Robot based on aiohttp, processes multiple URLs in parallel. + + Default list of robots is now multirequests,multiaio,curl. + Make all robots async. Split check_bookmark() into sync and async variants. diff --git a/doc/ChangeLog b/doc/ChangeLog index 5e08408..915b337 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,5 +1,9 @@ Version 6.0.0 (2024-??-??) + Robot based on aiohttp, processes multiple URLs in parallel. + + Default list of robots is now multirequests,multiaio,curl. + Make all robots async. Split check_bookmark() into sync and async variants. diff --git a/robots.py b/robots.py index 583d1a8..0ddba41 100644 --- a/robots.py +++ b/robots.py @@ -16,7 +16,7 @@ from os import environ from bkmk_objects import parse_params, set_params robot_names, robot_params = parse_params( - environ.get("BKMK_ROBOT", "multirequests,curl,requests,aiohttp")) + environ.get("BKMK_ROBOT", "multirequests,multiaio,curl")) def import_robot(robot_name): -- 2.39.5