From 92db996644b1f0ab782d5205ac33bbdfde06cad8 Mon Sep 17 00:00:00 2001
From: Oleg Broytman 
Date: Sun, 2 Mar 2025 19:52:41 +0300
Subject: [PATCH] Version 6.3.0: Robots based on pycurl

---
 Robots/bkmk_rcurl.py      | 42 +++++++++++++++++++++++++++
 Robots/bkmk_rmulticurl.py | 19 ++++++++++++
 Robots/curl_wrapper.py    | 61 +++++++++++++++++++++++++++++++++++++++
 bkmk_objects.py           |  2 +-
 doc/ANNOUNCE              |  6 +++-
 doc/ChangeLog             |  6 +++-
 setup.py                  |  1 +
 7 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100644 Robots/bkmk_rcurl.py
 create mode 100644 Robots/bkmk_rmulticurl.py
 create mode 100644 Robots/curl_wrapper.py

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
new file mode 100644
index 0000000..121534b
--- /dev/null
+++ b/Robots/bkmk_rcurl.py
@@ -0,0 +1,42 @@
+"""Robot based on pycurl; get single URL at a time
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import pycurl
+
+from Robots.base import robot_base
+from Robots.curl_wrapper import CurlWrapper
+
+
+class robot_curl(robot_base):
+    def version_str(self):
+        return 'pycurl %s' % pycurl.version
+
+    async def get(self, url, req_headers, use_proxy=False):
+        cw = CurlWrapper(url, headers=req_headers,
+                         proxy=self.proxy if use_proxy else None,
+                         timeout=self.timeout)
+        try:
+            cw.perform()
+        except pycurl.error as e:
+            if e.args[0] == 404:
+                status = 404
+            else:
+                status = None
+            return 'Error: %s' % e, status, None, None
+
+        status = cw.getinfo(pycurl.HTTP_CODE)
+        headers = cw.resp_headers
+        body = cw.body
+        cw.close()
+
+        return None, status, headers, body
diff --git a/Robots/bkmk_rmulticurl.py b/Robots/bkmk_rmulticurl.py
new file mode 100644
index 0000000..9f88e1f
--- /dev/null
+++ b/Robots/bkmk_rmulticurl.py
@@ -0,0 +1,19 @@
+"""Robot based on pycurl and concurrent.futures,
+processes multiple URLs in parallel (multiprocess).
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_multicurl']
+
+
+from Robots.concurrent_futures import cf_multiprocess
+
+
+class robot_multicurl(cf_multiprocess):
+    robot_name = 'curl'
diff --git a/Robots/curl_wrapper.py b/Robots/curl_wrapper.py
new file mode 100644
index 0000000..0df3402
--- /dev/null
+++ b/Robots/curl_wrapper.py
@@ -0,0 +1,61 @@
+import pycurl
+
+
+class CurlWrapper:
+    def __init__(self, url, headers=None, proxy=None, timeout=None):
+        self.curl = curl = pycurl.Curl()
+        self.url = url
+        self.resp_headers = {}
+        self.body = b''
+
+        if headers:
+            _headers = []
+            for h, v in headers.items():
+                _headers.append('%s: %s' % (h, v))
+            curl.setopt(pycurl.HTTPHEADER, _headers)
+            _headers = []
+            del _headers
+
+        if proxy:
+            curl.setopt(pycurl.PROXY, proxy)
+
+        # Do not follow redirects
+        curl.setopt(pycurl.FOLLOWLOCATION, 0)
+        # Lower security settings - we need to get as much as possible
+        curl.setopt(pycurl.SSL_CIPHER_LIST, 'ALL:@SECLEVEL=1')
+        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
+        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+        # Set timeouts to avoid hanging too long
+        if timeout:
+            curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
+            curl.setopt(pycurl.TIMEOUT, timeout)
+        # Parse Last-Modified
+        curl.setopt(pycurl.OPT_FILETIME, 1)
+
+        # Set up a callback to capture the headers and the body
+        curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
+        curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)
+
+        curl.setopt(pycurl.HTTPGET, 1)
+        curl.setopt(pycurl.URL, url)
+
+    def __getattr__(self, attr):
+        return getattr(self.curl, attr)
+
+    def header_callback(self, data):
+        for encoding in 'ascii', 'utf-8', 'latin1':
+            try:
+                data = data.decode(encoding)
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        else:
+            print("Error decoding header:", data)
+            return
+        if ':' in data:
+            key, value = data.split(':', 1)
+            self.resp_headers[key.title()] = value.strip()
+
+    def body_callback(self, data):
+        self.body += data
diff --git a/bkmk_objects.py b/bkmk_objects.py
index 6ad1442..c729a55 100644
--- a/bkmk_objects.py
+++ b/bkmk_objects.py
@@ -7,7 +7,7 @@ This file is a part of Bookmarks database and Internet robot.
 __author__ = "Oleg Broytman "
 __copyright__ = "Copyright (C) 2000-2025 PhiloSoft Design"
 __license__ = "GNU GPL"
-__version__ = '6.2.0'
+__version__ = '6.3.0'
 
 __all__ = ['Folder', 'Bookmark', 'Ruler', 'Walker', 'Writer', 'Robot',
            'InverseLinker', 'Linear', 'make_linear', 'make_tree', 'break_tree',
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE
index 6fac97c..af83c03 100644
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -7,9 +7,13 @@ bookmarks.html.
 
 WHAT'S NEW
 
+Version 6.3.0 (2025-03-02)
+
+   Robots based on pycurl.
+
 Version 6.2.0 (2025-03-02)
 
-   Robot based on httpx.
+   Robots based on httpx.
 
    Robots: Removed ftp_timeout.
 
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 0a02b93..bb54f4a 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,6 +1,10 @@
+Version 6.3.0 (2025-03-02)
+
+   Robots based on pycurl.
+
 Version 6.2.0 (2025-03-02)
 
-   Robot based on httpx.
+   Robots based on httpx.
 
    Robots: Removed ftp_timeout.
 
diff --git a/setup.py b/setup.py
index 237f0e7..c8d9c0c 100755
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@ setup(
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests[socks]'],
         'httpx': ['httpx[socks]'],
+        'curl': ['pycurl'],
         'aiohttp': ['aiohttp>=3', 'aiohttp-socks', 'aioftp[socks]'],
     },
 )
-- 
2.39.5