From e310b274600eb4be00cbccec635f3c102eaac8ac Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Wed, 31 Jul 2024 20:29:29 +0300
Subject: [PATCH] Feat(Robots): Robot based on PycURL

---
 Robots/bkmk_rcurl.py | 88 ++++++++++++++++++++++++++++++++++++++++++++
 bkmk_db-venv         |  3 +-
 doc/TODO             |  2 -
 setup.py             |  1 +
 4 files changed, 91 insertions(+), 3 deletions(-)
 create mode 100644 Robots/bkmk_rcurl.py

diff --git a/Robots/bkmk_rcurl.py b/Robots/bkmk_rcurl.py
new file mode 100644
index 0000000..6125856
--- /dev/null
+++ b/Robots/bkmk_rcurl.py
@@ -0,0 +1,88 @@
+"""Robot based on PycURL
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2024 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_curl']
+
+
+import certifi
+import pycurl
+
+from Robots.bkmk_robot_base import robot_base, request_headers
+
+
+class robot_curl(robot_base):
+    def get(self, bookmark, url, accept_charset=False, use_proxy=False):
+        if accept_charset and bookmark.charset:
+            headers = request_headers.copy()
+            headers['Accept-Charset'] = bookmark.charset
+        else:
+            headers = request_headers
+        headers = ['%s: %s' % (k, v) for k, v in headers.items()]
+
+        curl = pycurl.Curl()
+        self.headers = {}
+        self.body = b''
+
+        # Do not follow redirects
+        curl.setopt(pycurl.FOLLOWLOCATION, 0)
+        # Verify that we've got the right site; harmless on a non-SSL connect.
+        curl.setopt(pycurl.SSL_VERIFYHOST, 2)
+        curl.setopt(curl.CAINFO, certifi.where())
+        # Set timeouts to avoid hanging too long
+        curl.setopt(pycurl.CONNECTTIMEOUT, 30)
+        curl.setopt(pycurl.TIMEOUT, 60)
+        # Parse Last-Modified
+        curl.setopt(pycurl.OPT_FILETIME, 1)
+
+        if use_proxy:
+            curl.setopt(pycurl.PROXY, self.proxy)
+
+        # Set up callbacks to capture the headers and the body
+        curl.setopt(pycurl.HEADERFUNCTION, self.header_callback)
+        curl.setopt(pycurl.WRITEFUNCTION, self.body_callback)
+
+        curl.setopt(pycurl.HTTPGET, 1)
+        curl.setopt(pycurl.HTTPHEADER, headers)
+        curl.setopt(pycurl.URL, url)
+        try:
+            curl.perform()
+        except pycurl.error as e:
+            error = str(e)
+            return error, None, None, None, None
+
+        status = curl.getinfo(pycurl.HTTP_CODE)
+        curl.close()
+
+        if status >= 400:
+            return "Error %d" % status, status, None, None, None
+        if status >= 300:
+            return None, status, self.headers['Location'], None, None
+        return None, None, None, self.headers, self.body
+
+    def header_callback(self, data):
+        for encoding in 'ascii', 'latin1', 'utf-8':
+            try:
+                data = data.decode(encoding)
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        else:
+            print("Error decoding header:", data)
+            return
+        if ':' in data:
+            key, value = data.split(':', 1)
+            self.headers[key.title()] = value.strip()
+
+    def body_callback(self, data):
+        self.body += data
+
+    def get_ftp_welcome(self):
+        return ''  # We don't store the welcome message yet
diff --git a/bkmk_db-venv b/bkmk_db-venv
index be74fae..faed80b 100644
--- a/bkmk_db-venv
+++ b/bkmk_db-venv
@@ -8,6 +8,7 @@ if [ -z "$VIRTUAL_ENV" ]; then
     } &&
     . bkmk_db-venv/bin/activate &&
     pip install --compile --upgrade beautifulsoup4 lxml m_lib.full \
-        requests requests-ftp
+        requests requests-ftp \
+        certifi pycurl
 }
 fi
diff --git a/doc/TODO b/doc/TODO
index f9c1af0..23564fc 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,5 +1,3 @@
-Robot based on PycURL.
-
 Robot based on aiohttp.
 
 Robot(s) that test many URLs in parallel.
diff --git a/setup.py b/setup.py
index e872dc6..478b81e 100755
--- a/setup.py
+++ b/setup.py
@@ -42,5 +42,6 @@ setup(
     extras_require={
         'html': ['beautifulsoup4', 'lxml'],
         'requests': ['requests', 'requests-ftp'],
+        'curl': ['pycurl', 'certifi'],
     },
 )
-- 
2.39.5
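
Editor's note, not part of the patch: the sketch below shows the same PycURL callback-capture pattern that robot_curl.get() relies on, stripped of the bookmarks framework, for readers unfamiliar with HEADERFUNCTION/WRITEFUNCTION. The URL and the final print are illustrative assumptions; redirect handling, proxy support and the Accept-Charset logic of the robot are omitted.

import certifi
import pycurl

headers = {}
chunks = []

def header_callback(line):
    # PycURL delivers each raw response header line as bytes.
    line = line.decode('latin1')
    if ':' in line:
        key, value = line.split(':', 1)
        headers[key.title()] = value.strip()

def body_callback(chunk):
    # The body arrives in arbitrary-sized byte chunks.
    chunks.append(chunk)

curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'https://example.com/')     # illustrative URL
curl.setopt(pycurl.FOLLOWLOCATION, 0)               # handle redirects in the caller
curl.setopt(pycurl.CAINFO, certifi.where())         # verify TLS with certifi's CA bundle
curl.setopt(pycurl.CONNECTTIMEOUT, 30)
curl.setopt(pycurl.TIMEOUT, 60)
curl.setopt(pycurl.HEADERFUNCTION, header_callback)
curl.setopt(pycurl.WRITEFUNCTION, body_callback)

try:
    curl.perform()
    status = curl.getinfo(pycurl.HTTP_CODE)
finally:
    curl.close()

print(status, headers.get('Content-Type'), len(b''.join(chunks)))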