From: Oleg Broytman <phd@phdru.name>
Date: Wed, 31 Jul 2024 17:29:29 +0000 (+0300)
Subject: Feat(Robots): Robot based on PycURL
X-Git-Tag: 5.4.0~14
X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=e310b274600eb4be00cbccec635f3c102eaac8ac;p=bookmarks_db.git

Feat(Robots): Robot based on PycURL
---

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
new file mode 100644
index 0000000..6125856
--- /dev/null
+++ b/Robots/bkmk_rcurl.py
@@ -0,0 +1,123 @@
+"""Robot based on PycURL
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import certifi
+import pycurl
+
+from Robots.bkmk_robot_base import robot_base, request_headers
+
+
+class robot_curl(robot_base):
+    """Robot that fetches URLs using PycURL."""
+
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
+        """Fetch `url` with one GET request; redirects are not followed.
+
+        When `accept_charset` is true and `bookmark.charset` is set,
+        an Accept-Charset header with that value is added.
+        When `use_proxy` is true the request goes through `self.proxy`.
+
+        Return a tuple (error, status, location, headers, body):
+
+        - PycURL failure: (str(exception), None, None, None, None);
+        - status >= 400: ("Error <status>", status, None, None, None);
+        - status >= 300 with Location: (None, status, location, None, None);
+        - status >= 300 without Location: (message, status, None, None, None);
+        - otherwise: (None, None, None, headers dict, body bytes).
+        """
+        if accept_charset and bookmark.charset:
+            headers = request_headers.copy()
+            headers['Accept-Charset'] = bookmark.charset
+        else:
+            headers = request_headers
+        headers = ['%s: %s' % (k, v) for k, v in headers.items()]
+
+        # Reset the per-request state filled in by the callbacks
+        self.headers = {}
+        self.body = b''
+
+        curl = pycurl.Curl()
+        try:
+            # Do not follow redirects
+            curl.setopt(pycurl.FOLLOWLOCATION, 0)
+            # Verify that we've got the right site;
+            # harmless on a non-SSL connect.
+            curl.setopt(pycurl.SSL_VERIFYHOST, 2)
+            curl.setopt(curl.CAINFO, certifi.where())
+            # Set timeouts to avoid hanging too long
+            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
+            curl.setopt(pycurl.TIMEOUT, 60)
+            # Parse Last-Modified
+            curl.setopt(pycurl.OPT_FILETIME, 1)
+
+            if use_proxy:
+                curl.setopt(pycurl.PROXY, self.proxy)
+
+            # Set up callbacks to capture the headers and the body
+            curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
+            curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)
+
+            curl.setopt(pycurl.HTTPGET, 1)
+            curl.setopt(pycurl.HTTPHEADER, headers)
+            curl.setopt(pycurl.URL, url)
+            try:
+                curl.perform()
+            except pycurl.error as e:
+                return str(e), None, None, None, None
+
+            status = curl.getinfo(pycurl.HTTP_CODE)
+        finally:
+            # Release the handle on every path, including errors
+            curl.close()
+
+        if status >= 400:
+            return "Error %d" % status, status, None, None, None
+        if status >= 300:
+            location = self.headers.get('Location')
+            if location is None:
+                # Report a redirect without a target as an error
+                # instead of failing with KeyError
+                return ("Error %d: no Location header" % status,
+                        status, None, None, None)
+            return None, status, location, None, None
+        return None, None, None, self.headers, self.body
+
+    def header_callback(self, data):
+        """Decode one raw header line and store it in `self.headers`.
+
+        Lines without a colon are ignored; header names are
+        normalized with str.title() and values are stripped.
+        """
+        # latin1 decodes any byte string, so it must come last;
+        # otherwise utf-8 would never be tried.
+        for encoding in 'ascii', 'utf-8', 'latin1':
+            try:
+                data = data.decode(encoding)
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        else:
+            print("Error decoding header:", data)
+            return
+        if ':' in data:
+            key, value = data.split(':', 1)
+            self.headers[key.title()] = value.strip()
+
+    def body_callback(self, data):
+        """Append a chunk of the response body to `self.body`."""
+        self.body += data
+
+    def get_ftp_welcome(self):
+        """Return the FTP welcome message; always empty for now."""
+        return ''  # We don't store the welcome message yet
diff --git a/bkmk_db-venv b/bkmk_db-venv
index be74fae..faed80b 100644
--- a/bkmk_db-venv
+++ b/bkmk_db-venv
@@ -8,6 +8,7 @@ if [ -z "$VIRTUAL_ENV" ]; then
          } &&
          . bkmk_db-venv/bin/activate &&
          pip install --compile --upgrade beautifulsoup4 lxml m_lib.full \
-         requests requests-ftp
+         requests requests-ftp \
+         certifi pycurl
     }
 fi
diff --git a/doc/TODO b/doc/TODO
index f9c1af0..23564fc 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,5 +1,3 @@
-Robot based on PycURL.
-
 Robot based on aiohttp.
 
 Robot(s) that test many URLs in parallel.
diff --git a/setup.py b/setup.py
index e872dc6..478b81e 100755
--- a/setup.py
+++ b/setup.py
@@ -42,5 +42,6 @@ setup(
     extras_require={
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests', 'requests-ftp'],
+        'curl': ['pycurl', 'certifi'],
     },
 )