]> git.phdru.name Git - bookmarks_db.git/commitdiff
Version 6.3.0: Robots based on pycurl 6.3.0
author: Oleg Broytman <phd@phdru.name>
Sun, 2 Mar 2025 16:52:41 +0000 (19:52 +0300)
committer: Oleg Broytman <phd@phdru.name>
Sun, 2 Mar 2025 16:52:41 +0000 (19:52 +0300)
Robots/bkmk_rcurl.py [new file with mode: 0644]
Robots/bkmk_rmulticurl.py [new file with mode: 0644]
Robots/curl_wrapper.py [new file with mode: 0644]
bkmk_objects.py
doc/ANNOUNCE
doc/ChangeLog
setup.py

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
new file mode 100644 (file)
index 0000000..121534b
--- /dev/null
@@ -0,0 +1,42 @@
+"""Robot based on pycurl; get single URL at a time
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import pycurl
+
+from Robots.base import robot_base
+from Robots.curl_wrapper import CurlWrapper
+
+
class robot_curl(robot_base):
    """Robot that fetches a single URL at a time via pycurl."""

    def version_str(self):
        """Return a human-readable name/version of the HTTP backend."""
        # Fix: the backend name was misspelled 'pucurl'.
        return 'pycurl %s' % pycurl.version

    async def get(self, url, req_headers, use_proxy=False):
        """Fetch *url* and return a tuple (error, status, headers, body).

        On success ``error`` is None; on failure ``error`` is a message
        string, ``headers`` and ``body`` are None, and ``status`` is the
        HTTP code when one can be extracted from the pycurl error.
        """
        cw = CurlWrapper(url, headers=req_headers,
                         proxy=self.proxy if use_proxy else None,
                         timeout=self.timeout)
        try:
            cw.perform()
        except pycurl.error as e:
            # Fix: close the curl handle on the error path too —
            # it was leaked when perform() raised.
            cw.close()
            status = 404 if e.args[0] == 404 else None
            return 'Error: %s' % e, status, None, None

        status = cw.getinfo(pycurl.HTTP_CODE)
        headers = cw.resp_headers
        body = cw.body
        cw.close()

        return None, status, headers, body
diff --git a/Robots/bkmk_rmulticurl.py b/Robots/bkmk_rmulticurl.py
new file mode 100644 (file)
index 0000000..9f88e1f
--- /dev/null
@@ -0,0 +1,19 @@
+"""Robot based on pycurl and concurrent.futures,
+processes multiple URLs in parallel (multiprocess).
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2025 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_multicurl']
+
+
+from Robots.concurrent_futures import cf_multiprocess
+
+
class robot_multicurl(cf_multiprocess):
    # Name of the underlying single-URL robot each worker process runs;
    # presumably resolved by the cf_multiprocess framework to robot_curl
    # (Robots/bkmk_rcurl.py) — confirm in Robots.concurrent_futures.
    robot_name = 'curl'
diff --git a/Robots/curl_wrapper.py b/Robots/curl_wrapper.py
new file mode 100644 (file)
index 0000000..0df3402
--- /dev/null
@@ -0,0 +1,61 @@
+import pycurl
+
+
class CurlWrapper:
    """Thin wrapper around pycurl.Curl that collects the response.

    After ``perform()``, ``resp_headers`` maps Title-Cased header names
    to their values and ``body`` holds the raw response bytes.  Unknown
    attributes (``perform``, ``getinfo``, ``close``, ...) are delegated
    to the underlying Curl handle.
    """

    def __init__(self, url, headers=None, proxy=None, timeout=None):
        """Prepare a GET request for *url*.

        headers: mapping of request header name -> value, or None.
        proxy: proxy URL, or None for a direct connection.
        timeout: connect and total timeout in seconds, or None.
        """
        self.curl = curl = pycurl.Curl()
        self.url = url
        self.resp_headers = {}
        self.body = b''

        if headers:
            # Idiom fix: build the header list with a comprehension
            # (the original also created and deleted a spurious
            # second `_headers = []` binding).
            curl.setopt(pycurl.HTTPHEADER,
                        ['%s: %s' % (h, v) for h, v in headers.items()])

        if proxy:
            curl.setopt(pycurl.PROXY, proxy)

        # Do not follow redirects
        curl.setopt(pycurl.FOLLOWLOCATION, 0)
        # Lower security settings - we need to get as much as possible
        curl.setopt(pycurl.SSL_CIPHER_LIST, 'ALL:@SECLEVEL=1')
        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        # Set timeouts to avoid hanging too long
        if timeout:
            curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
            curl.setopt(pycurl.TIMEOUT, timeout)
        # Parse Last-Modified
        curl.setopt(pycurl.OPT_FILETIME, 1)

        # Set up callbacks to capture the headers and the body
        curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
        curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)

        curl.setopt(pycurl.HTTPGET, 1)
        curl.setopt(pycurl.URL, url)

    def __getattr__(self, attr):
        # Delegate everything not defined here to the pycurl handle.
        return getattr(self.curl, attr)

    def header_callback(self, data):
        """Decode one raw header line and store it in resp_headers.

        Fix: try utf-8 *before* latin1 — latin1 decodes any byte
        sequence, so in the original order (ascii, latin1, utf-8) the
        utf-8 attempt and the decoding-error branch were unreachable.
        latin1 remains the final, always-succeeding fallback.
        """
        for encoding in 'ascii', 'utf-8', 'latin1':
            try:
                line = data.decode(encoding)
            except UnicodeDecodeError:
                continue
            break
        # Status lines and blank lines have no colon; skip them.
        if ':' in line:
            key, value = line.split(':', 1)
            self.resp_headers[key.title()] = value.strip()

    def body_callback(self, data):
        # Accumulate one chunk of the response body.
        self.body += data
index 6ad1442f272019fa1bd51428542e81c6be84e4c2..c729a55614c6948b6b5ee24b445a1e5bea1512a9 100644 (file)
@@ -7,7 +7,7 @@ This file is a part of Bookmarks database and Internet robot.
 __author__ = "Oleg Broytman <phd@phdru.name>"
 __copyright__ = "Copyright (C) 2000-2025 PhiloSoft Design"
 __license__ = "GNU GPL"
-__version__ = '6.2.0'
+__version__ = '6.3.0'
 
 __all__ = ['Folder', 'Bookmark', 'Ruler', 'Walker', 'Writer', 'Robot',
            'InverseLinker', 'Linear', 'make_linear', 'make_tree', 'break_tree',
index 6fac97c1e10dc67b11a8dc7b81a3dd57b618f95e..af83c036136b66cc0a126037a4bec094a472b54e 100644 (file)
@@ -7,9 +7,13 @@ bookmarks.html.
 
 WHAT'S NEW
 
+Version 6.3.0 (2025-03-02)
+
+   Robots based on pycurl.
+
 Version 6.2.0 (2025-03-02)
 
-   Robot based on httpx.
+   Robots based on httpx.
 
    Robots: Removed ftp_timeout.
 
index 0a02b9341152dc84a34e02df18474ea8ec33e3ab..bb54f4a8a8dcc579a12102f89856fea99a7bdb34 100644 (file)
@@ -1,6 +1,10 @@
+Version 6.3.0 (2025-03-02)
+
+   Robots based on pycurl.
+
 Version 6.2.0 (2025-03-02)
 
-   Robot based on httpx.
+   Robots based on httpx.
 
    Robots: Removed ftp_timeout.
 
index 237f0e7e6de7491c95abed4ae7840bb814139f00..c8d9c0ce8fc661d739a1ae126ad5afe3a46954bd 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@ setup(
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests[socks]'],
         'httpx': ['httpx[socks]'],
+        'curl': ['pycurl'],
         'aiohttp': ['aiohttp>=3', 'aiohttp-socks', 'aioftp[socks]'],
     },
 )