From 5bcabc2a6e42fc6c3a8e475bb17b7c086969cdb7 Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Mon, 5 Aug 2024 15:00:55 +0300
Subject: [PATCH] Feat(Robots): Robot based on aiohttp

---
Reviewer note: the added Robots/bkmk_raiohttp.py was amended in review
(no KeyError on a 3xx reply without a Location header; no mutable
default for ``headers``; docstrings).  The commit id and blob ids in
this file predate that amendment.

 Robots/bkmk_raiohttp.py   | 85 +++++++++++++++++++++++++++++++++++++++
 Robots/bkmk_robot_base.py | 11 ++++---
 bkmk_db-venv              |  2 +-
 doc/ANNOUNCE              |  6 ++++
 doc/ChangeLog             |  6 ++++
 doc/TODO                  |  2 +-
 robots.py                 |  2 +-
 setup.py                  |  3 +-
 8 files changed, 109 insertions(+), 8 deletions(-)
 create mode 100644 Robots/bkmk_raiohttp.py

diff --git a/Robots/bkmk_raiohttp.py b/Robots/bkmk_raiohttp.py
new file mode 100644
index 0000000..ed4dac6
--- /dev/null
+++ b/Robots/bkmk_raiohttp.py
@@ -0,0 +1,85 @@
+"""Robot based on aiohttp
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_aiohttp']
+
+
+import asyncio
+import aiohttp
+import aiohttp.client_exceptions
+from Robots.bkmk_robot_base import robot_base, request_headers
+
+
+class robot_aiohttp(robot_base):
+    """Robot that fetches bookmark URLs using aiohttp."""
+
+    def version_str(self):
+        """Return the aiohttp name and version string."""
+        return 'aiohttp/%s' % aiohttp.__version__
+
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
+        """Fetch ``url`` once, without following redirects.
+
+        Returns a 5-tuple ``(error, status, redirect_to, headers, body)``:
+        an error message with everything after ``status`` set to None on
+        failure or HTTP status >= 400; the ``Location`` header as
+        ``redirect_to`` for status >= 300; response headers and body
+        otherwise.
+        """
+        if accept_charset and bookmark.charset:
+            headers = request_headers.copy()
+            headers['Accept-Charset'] = bookmark.charset
+        else:
+            headers = request_headers
+
+        if use_proxy:
+            proxy = self.proxy
+        else:
+            proxy = None
+
+        error, status, resp_headers, body = asyncio.run(get(
+            url, headers=headers, proxy=proxy,
+            connect_timeout=self.connect_timeout, timeout=self.timeout,
+        ))
+        if error is not None or (status and status >= 400):
+            if error is None:
+                error = 'Error %d' % status
+            else:
+                error = str(error)
+                if status:
+                    error = 'Error %d %s' % (status, error)
+            return error, status, None, None, None
+        if status and status >= 300:
+            redirect_to = resp_headers.get('Location')
+            if redirect_to is None:
+                # A 3xx response is not required to carry Location;
+                # report it as an error instead of raising KeyError.
+                error = 'Error %d: no Location header' % status
+                return error, status, None, None, None
+            return None, status, redirect_to, None, None
+        return None, status, None, resp_headers, body
+
+
+async def get(url, headers=None, proxy=None, connect_timeout=30, timeout=60):
+    """Perform a single GET request without following redirects.
+
+    Returns ``(error, status, headers, body)``: ``error`` is None on
+    success; on timeout or client error it is the exception and the
+    other three items are None.
+    """
+    timeout = aiohttp.ClientTimeout(connect=connect_timeout, total=timeout)
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(
+                url, headers=headers, proxy=proxy,
+                allow_redirects=False) as resp:
+                return None, resp.status, resp.headers, await resp.read()
+    except (asyncio.TimeoutError, aiohttp.client_exceptions.ClientError) as e:
+        return e, None, None, None
diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py
index a742941..dffffdb 100644
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -105,7 +105,7 @@ class robot_base(Robot):
             error, http_status_code, redirect_to, headers, content = \
                 self.smart_get(bookmark, bookmark.href, True)
 
-            if error:
+            if error is not None:
                 bookmark.error = error
                 return 1
 
@@ -169,7 +169,8 @@ class robot_base(Robot):
                                 is_html = True
                                 break
                     content_stripped = content.strip()
-                    if content_stripped and charset:
+                    if content_stripped and charset \
+                            and isinstance(content_stripped, bytes):
                         try:
                             content_stripped = content_stripped.decode(
                                 charset, 'replace')
@@ -371,13 +372,15 @@ class robot_base(Robot):
                                  use_proxy=True)
                     if error is None:
                         self.proxy_ok.add(url_host)
-        if error is not None:
+        if (error is not None) or (
+            http_status_code and (http_status_code >= 400)
+        ):
             if use_proxy:
                 self.log(' Proxy error : %s' % error)
                 if url_host not in self.proxy_ok:
                     self.proxy_error.add(url_host)
             return error, http_status_code, None, None, None
-        if http_status_code:
+        if http_status_code and (http_status_code >= 300):
             return None, http_status_code, redirect_to, None, None
         return None, None, None, headers, content
 
diff --git a/bkmk_db-venv b/bkmk_db-venv
index faed80b..62d0d98 100644
--- a/bkmk_db-venv
+++ b/bkmk_db-venv
@@ -9,6 +9,6 @@ if [ -z "$VIRTUAL_ENV" ]; then
          . bkmk_db-venv/bin/activate &&
          pip install --compile --upgrade beautifulsoup4 lxml m_lib.full \
          requests requests-ftp \
-         certifi pycurl
+         pycurl certifi aiohttp
      }
 fi
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE
index aec114b..3fb4356 100644
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -7,6 +7,12 @@ bookmarks.html.
 
 WHAT'S NEW
 
+Version 5.5.0 (2024-08-06)
+
+   Robot based on aiohttp.
+
+   Default list of robots is now aiohttp,curl,requests,forking.
+
 Version 5.4.1 (2024-08-04)
 
    Fix(bkmk_rcurl): IDNA-encode URLs. PycURL doesn't encode URLs itself
diff --git a/doc/ChangeLog b/doc/ChangeLog
index da56547..cb8415d 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,9 @@
+Version 5.5.0 (2024-08-06)
+
+   Robot based on aiohttp.
+
+   Default list of robots is now aiohttp,curl,requests,forking.
+
 Version 5.4.1 (2024-08-04)
 
    Fix(bkmk_rcurl): IDNA-encode URLs. PycURL doesn't encode URLs itself
diff --git a/doc/TODO b/doc/TODO
index 23564fc..d2e6573 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,4 +1,4 @@
-Robot based on aiohttp.
+aioftp.
 
 Robot(s) that test many URLs in parallel.
 
diff --git a/robots.py b/robots.py
index 575f9bd..06ce1fe 100644
--- a/robots.py
+++ b/robots.py
@@ -16,7 +16,7 @@ from os import environ
 from bkmk_objects import parse_params, set_params
 
 robot_names, robot_params = parse_params(
-    environ.get("BKMK_ROBOT", "curl,requests,forking"))
+    environ.get("BKMK_ROBOT", "aiohttp,curl,requests,forking"))
 
 
 def import_robot(robot_name):
diff --git a/setup.py b/setup.py
index 27ef38b..0a570ad 100755
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup
 
 setup(
     name='bookmarks_db',
-    version='5.4.1',
+    version='5.5.0',
     description='Bookmarks database and Internet robot',
     long_description=open('README', 'r').read(),
     long_description_content_type="text/plain",
@@ -41,5 +41,6 @@ setup(
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests', 'requests-ftp'],
         'curl': ['pycurl', 'certifi'],
+        'aiohttp:python_version>="3.4"': ['aiohttp'],
     },
 )
-- 
2.39.5