git.phdru.name Git - bookmarks_db.git/commitdiff
Feat(Robots): Robot based on PycURL
author Oleg Broytman <phd@phdru.name>
Wed, 31 Jul 2024 17:29:29 +0000 (20:29 +0300)
committer Oleg Broytman <phd@phdru.name>
Wed, 31 Jul 2024 22:48:38 +0000 (01:48 +0300)
Robots/bkmk_rcurl.py [new file with mode: 0644]
bkmk_db-venv
doc/TODO
setup.py

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
new file mode 100644 (file)
index 0000000..6125856
--- /dev/null
+++ b/Robots/bkmk_rcurl.py
@@ -0,0 +1,88 @@
+"""Robot based on PycURL
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import certifi
+import pycurl
+
+from Robots.bkmk_robot_base import robot_base, request_headers
+
+
+class robot_curl(robot_base):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
+        if accept_charset and bookmark.charset:
+            headers = request_headers.copy()
+            headers['Accept-Charset'] = bookmark.charset
+        else:
+            headers = request_headers
+        headers = ['%s: %s' % (k, v) for k, v in headers.items()]
+
+        curl = pycurl.Curl()
+        self.headers = {}
+        self.body = b''
+
+        # Do not follow redirects
+        curl.setopt(pycurl.FOLLOWLOCATION, 0)
+        # Verify that we've got the right site; harmless on a non-SSL connect.
+        curl.setopt(pycurl.SSL_VERIFYHOST, 2)
+        curl.setopt(curl.CAINFO, certifi.where())
+        # Set timeouts to avoid hanging too long
+        curl.setopt(pycurl.CONNECTTIMEOUT, 30)
+        curl.setopt(pycurl.TIMEOUT, 60)
+        # Parse Last-Modified
+        curl.setopt(pycurl.OPT_FILETIME, 1)
+
+        if use_proxy:
+            curl.setopt(pycurl.PROXY, self.proxy)
+
+        # Set up a callback to capture the headers and the body
+        curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
+        curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)
+
+        curl.setopt(pycurl.HTTPGET, 1)
+        curl.setopt(pycurl.HTTPHEADER, headers)
+        curl.setopt(pycurl.URL, url)
+        try:
+            curl.perform()
+        except pycurl.error as e:
+            error = str(e)
+            return error, None, None, None, None
+
+        status = curl.getinfo(pycurl.HTTP_CODE)
+        curl.close()
+
+        if status >= 400:
+            return "Error %d" % status, status, None, None, None
+        if status >= 300:
+            return None, status, self.headers['Location'], None, None
+        return None, None, None, self.headers, self.body
+
+    def header_callback(self, data):
+        for encoding in 'ascii', 'latin1', 'utf-8':
+            try:
+                data = data.decode(encoding)
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        else:
+            print("Error decoding header:", data)
+            return
+        if ':' in data:
+            key, value = data.split(':', 1)
+            self.headers[key.title()] = value.strip()
+
+    def body_callback(self, data):
+        self.body += data
+
+    def get_ftp_welcome(self):
+        return ''  # We don't store the welcome message yet
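
For reference, a minimal usage sketch (not part of the commit) of how the new
robot's get() could be driven. The robot_base constructor argument and the
bookmark stand-in are assumptions; only the five-element return value comes
from the code above.

    from Robots.bkmk_rcurl import robot_curl

    class FakeBookmark:
        charset = None  # stand-in for the real bookmark object (assumption)

    robot = robot_curl(print)  # assumed: robot_base takes a log callable
    error, status, redirect, headers, body = robot.get(
        FakeBookmark(), 'https://example.org/')
    if error:
        print('error:', error)
    elif redirect:
        print('redirect %s -> %s' % (status, redirect))
    else:
        print('%d header(s), %d byte(s) of body' % (len(headers), len(body)))
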
diff --git a/bkmk_db-venv b/bkmk_db-venv
index be74fae15267f0418e3e510078095ac5ceb3c2f9..faed80bcdc1ebf038a19a51a09b0b4569355fe85 100644 (file)
--- a/bkmk_db-venv
+++ b/bkmk_db-venv
@@ -8,6 +8,7 @@ if [ -z "$VIRTUAL_ENV" ]; then
          } &&
          . bkmk_db-venv/bin/activate &&
          pip install --compile --upgrade beautifulsoup4 lxml m_lib.full \
-         requests requests-ftp
+         requests requests-ftp \
+         certifi pycurl
     }
 fi
diff --git a/doc/TODO b/doc/TODO
index f9c1af0e6890888afe329fd2e9d31c695071b3da..23564fc7977dfecd2deffc7f7522286a3c265ecf 100644 (file)
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,5 +1,3 @@
-Robot based on PycURL.
-
 Robot based on aiohttp.
 
 Robot(s) that test many URLs in parallel.
diff --git a/setup.py b/setup.py
index e872dc60dd424458b11958d7485560b985eec3a3..478b81e6187a6880affca1b8526931cc21289109 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -42,5 +42,6 @@ setup(
     extras_require={
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests', 'requests-ftp'],
+        'curl': ['pycurl', 'certifi'],
     },
 )
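
The new 'curl' extra makes pycurl and certifi optional, install-time
dependencies. A hedged sketch (not from the repository) of how calling code
could treat the PycURL robot as optional:

    try:
        import pycurl    # noqa: F401 -- pulled in by the 'curl' extra
        import certifi   # noqa: F401
        from Robots.bkmk_rcurl import robot_curl as robot
    except ImportError:
        robot = None  # the extra is not installed; fall back to another robot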