--- /dev/null
+"""Robot based on pycurl; get single URL at a time
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import pycurl
+
+from Robots.base import robot_base
+from Robots.curl_wrapper import CurlWrapper
+
+
+class robot_curl(robot_base):
+ def version_str(self):
+ return 'pucurl %s' % pycurl.version
+
+ async def get(self, url, req_headers, use_proxy=False):
+ cw = CurlWrapper(url, headers=req_headers,
+ proxy=self.proxy if use_proxy else None,
+ timeout=self.timeout)
+ try:
+ cw.perform()
+ except pycurl.error as e:
+ if e.args[0] == 404:
+ status = 404
+ else:
+ status = None
+ return 'Error: %s' % e, status, None, None
+
+ status = cw.getinfo(pycurl.HTTP_CODE)
+ headers = cw.resp_headers
+ body = cw.body
+ cw.close()
+
+ return None, status, headers, body
--- /dev/null
+"""Robot based on pycurl and concurrent.futures,
+processes multiple URLs in parallel (multiprocess).
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_multicurl']
+
+
+from Robots.concurrent_futures import cf_multiprocess
+
+
+class robot_multicurl(cf_multiprocess):
+    """Robot that fetches multiple URLs in parallel (multiprocess, pycurl)."""
+
+    # Name of the per-URL robot implementation to run in each worker
+    # process -- presumably resolved by cf_multiprocess; confirm against
+    # the base class.
+    robot_name = 'curl'
--- /dev/null
+import pycurl
+
+
+class CurlWrapper:
+ def __init__(self, url, headers=None, proxy=None, timeout=None):
+ self.curl = curl = pycurl.Curl()
+ self.url = url
+ self.resp_headers = {}
+ self.body = b''
+
+ if headers:
+ _headers = []
+ for h, v in headers.items():
+ _headers.append('%s: %s' % (h, v))
+ curl.setopt(pycurl.HTTPHEADER, _headers)
+ _headers = []
+ del _headers
+
+ if proxy:
+ curl.setopt(pycurl.PROXY, proxy)
+
+ # Do not follow redirects
+ curl.setopt(pycurl.FOLLOWLOCATION, 0)
+ # Lower security settings - we need to get as musch as possible
+ curl.setopt(pycurl.SSL_CIPHER_LIST, 'ALL:@SECLEVEL=1')
+ curl.setopt(pycurl.SSL_VERIFYHOST, 0)
+ curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+ # Set timeouts to avoid hanging too long
+ if timeout:
+ curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ curl.setopt(pycurl.TIMEOUT, timeout)
+ # Parse Last-Modified
+ curl.setopt(pycurl.OPT_FILETIME, 1)
+
+ # Set up a callback to capture the headers and the body
+ curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
+ curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)
+
+ curl.setopt(pycurl.HTTPGET, 1)
+ curl.setopt(pycurl.URL, url)
+
+ def __getattr__(self, attr):
+ return getattr(self.curl, attr)
+
+ def header_callback(self, data):
+ for encoding in 'ascii', 'latin1', 'utf-8':
+ try:
+ data = data.decode(encoding)
+ except UnicodeDecodeError:
+ pass
+ else:
+ break
+ else:
+ print("Error decoding header:", data)
+ return
+ if ':' in data:
+ key, value = data.split(':', 1)
+ self.resp_headers[key.title()] = value.strip()
+
+ def body_callback(self, data):
+ self.body += data
__author__ = "Oleg Broytman <phd@phdru.name>"
__copyright__ = "Copyright (C) 2000-2025 PhiloSoft Design"
__license__ = "GNU GPL"
-__version__ = '6.2.0'
+__version__ = '6.3.0'
__all__ = ['Folder', 'Bookmark', 'Ruler', 'Walker', 'Writer', 'Robot',
'InverseLinker', 'Linear', 'make_linear', 'make_tree', 'break_tree',
WHAT'S NEW
+Version 6.3.0 (2025-03-02)
+
+ Robots based on pycurl.
+
Version 6.2.0 (2025-03-02)
- Robot based on httpx.
+ Robots based on httpx.
Robots: Removed ftp_timeout.
+Version 6.3.0 (2025-03-02)
+
+ Robots based on pycurl.
+
Version 6.2.0 (2025-03-02)
- Robot based on httpx.
+ Robots based on httpx.
Robots: Removed ftp_timeout.
'html': ['beautifulsoup4', 'lxml'],
'requests': ['requests[socks]'],
'httpx': ['httpx[socks]'],
+ 'curl': 'pycurl',
'aiohttp': ['aiohttp>=3', 'aiohttp-socks', 'aioftp[socks]'],
},
)