From 92db996644b1f0ab782d5205ac33bbdfde06cad8 Mon Sep 17 00:00:00 2001
From: Oleg Broytman 
Date: Sun, 2 Mar 2025 19:52:41 +0300
Subject: [PATCH] Version 6.3.0: Robots based on pycurl

---
 Robots/bkmk_rcurl.py      | 42 +++++++++++++++++++++++++++
 Robots/bkmk_rmulticurl.py | 19 ++++++++++++
 Robots/curl_wrapper.py    | 61 +++++++++++++++++++++++++++++++++++++++
 bkmk_objects.py           |  2 +-
 doc/ANNOUNCE              |  6 +++-
 doc/ChangeLog             |  6 +++-
 setup.py                  |  1 +
 7 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100644 Robots/bkmk_rcurl.py
 create mode 100644 Robots/bkmk_rmulticurl.py
 create mode 100644 Robots/curl_wrapper.py

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
new file mode 100644
index 0000000..121534b
--- /dev/null
+++ b/Robots/bkmk_rcurl.py
@@ -0,0 +1,42 @@
+"""Robot based on pycurl; get single URL at a time
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import pycurl
+
+from Robots.base import robot_base
+from Robots.curl_wrapper import CurlWrapper
+
+
+class robot_curl(robot_base):
+    def version_str(self):
+        return 'pycurl %s' % pycurl.version
+
+    async def get(self, url, req_headers, use_proxy=False):
+        cw = CurlWrapper(url, headers=req_headers,
+                         proxy=self.proxy if use_proxy else None,
+                         timeout=self.timeout)
+        try:
+            cw.perform()
+        except pycurl.error as e:
+            if e.args[0] == 404:
+                status = 404
+            else:
+                status = None
+            return 'Error: %s' % e, status, None, None
+
+        status = cw.getinfo(pycurl.HTTP_CODE)
+        headers = cw.resp_headers
+        body = cw.body
+        cw.close()
+
+        return None, status, headers, body
diff --git a/Robots/bkmk_rmulticurl.py b/Robots/bkmk_rmulticurl.py
new file mode 100644
index 0000000..9f88e1f
--- /dev/null
+++ b/Robots/bkmk_rmulticurl.py
@@ -0,0 +1,19 @@
+"""Robot based on pycurl and concurrent.futures,
+processes multiple URLs in parallel (multiprocess).
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_multicurl']
+
+
+from Robots.concurrent_futures import cf_multiprocess
+
+
+class robot_multicurl(cf_multiprocess):
+    robot_name = 'curl'
diff --git a/Robots/curl_wrapper.py b/Robots/curl_wrapper.py
new file mode 100644
index 0000000..0df3402
--- /dev/null
+++ b/Robots/curl_wrapper.py
@@ -0,0 +1,61 @@
+import pycurl
+
+
+class CurlWrapper:
+    def __init__(self, url, headers=None, proxy=None, timeout=None):
+        self.curl = curl = pycurl.Curl()
+        self.url = url
+        self.resp_headers = {}
+        self.body = b''
+
+        if headers:
+            _headers = []
+            for h, v in headers.items():
+                _headers.append('%s: %s' % (h, v))
+            curl.setopt(pycurl.HTTPHEADER, _headers)
+            _headers = []
+            del _headers
+
+        if proxy:
+            curl.setopt(pycurl.PROXY, proxy)
+
+        # Do not follow redirects
+        curl.setopt(pycurl.FOLLOWLOCATION, 0)
+        # Lower security settings - we need to get as much as possible
+        curl.setopt(pycurl.SSL_CIPHER_LIST, 'ALL:@SECLEVEL=1')
+        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
+        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+        # Set timeouts to avoid hanging too long
+        if timeout:
+            curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
+            curl.setopt(pycurl.TIMEOUT, timeout)
+        # Parse Last-Modified
+        curl.setopt(pycurl.OPT_FILETIME, 1)
+
+        # Set up a callback to capture the headers and the body
+        curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
+        curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)
+
+        curl.setopt(pycurl.HTTPGET, 1)
+        curl.setopt(pycurl.URL, url)
+
+    def __getattr__(self, attr):
+        return getattr(self.curl, attr)
+
+    def header_callback(self, data):
+        for encoding in 'ascii', 'utf-8', 'latin1':
+            try:
+                data = data.decode(encoding)
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        else:
+            print("Error decoding header:", data)
+            return
+        if ':' in data:
+            key, value = data.split(':', 1)
+            self.resp_headers[key.title()] = value.strip()
+
+    def body_callback(self, data):
+        self.body += data
diff --git a/bkmk_objects.py b/bkmk_objects.py
index 6ad1442..c729a55 100644
--- a/bkmk_objects.py
+++ b/bkmk_objects.py
@@ -7,7 +7,7 @@ This file is a part of Bookmarks database and Internet robot.
 __author__ = "Oleg Broytman "
 __copyright__ = "Copyright (C) 2000-2025 PhiloSoft Design"
 __license__ = "GNU GPL"
-__version__ = '6.2.0'
+__version__ = '6.3.0'
 
 __all__ = ['Folder', 'Bookmark', 'Ruler', 'Walker', 'Writer', 'Robot',
            'InverseLinker', 'Linear', 'make_linear', 'make_tree', 'break_tree',
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE
index 6fac97c..af83c03 100644
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -7,9 +7,13 @@ bookmarks.html.
 
 WHAT'S NEW
 
+Version 6.3.0 (2025-03-02)
+
+   Robots based on pycurl.
+
 Version 6.2.0 (2025-03-02)
 
-   Robot based on httpx.
+   Robots based on httpx.
 
    Robots: Removed ftp_timeout.
 
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 0a02b93..bb54f4a 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,6 +1,10 @@
+Version 6.3.0 (2025-03-02)
+
+   Robots based on pycurl.
+
 Version 6.2.0 (2025-03-02)
 
-   Robot based on httpx.
+   Robots based on httpx.
 
    Robots: Removed ftp_timeout.
 
diff --git a/setup.py b/setup.py
index 237f0e7..c8d9c0c 100755
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@ setup(
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests[socks]'],
         'httpx': ['httpx[socks]'],
+        'curl': ['pycurl'],
         'aiohttp': ['aiohttp>=3', 'aiohttp-socks', 'aioftp[socks]'],
     },
 )
-- 
2.39.5