From 82e816782f69ed9bc08c50d84d52a095eaf32928 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Tue, 10 Sep 2024 15:15:37 +0300 Subject: [PATCH] Feat(bkmk_rtwisted): HTTP proxy --- Robots/bkmk_rtwisted.py | 16 ++++++++++++++-- doc/ANNOUNCE | 5 ++++- doc/ChangeLog | 5 ++++- robots.py | 2 +- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/Robots/bkmk_rtwisted.py b/Robots/bkmk_rtwisted.py index 3ee108e..34a5fad 100644 --- a/Robots/bkmk_rtwisted.py +++ b/Robots/bkmk_rtwisted.py @@ -12,11 +12,13 @@ __license__ = "GNU GPL" __all__ = ['robot_twisted'] +from urllib.parse import urlsplit from time import sleep from twisted import __version__ from twisted.internet import reactor -from twisted.web.client import Agent, readBody +from twisted.internet.endpoints import TCP4ClientEndpoint +from twisted.web.client import Agent, ProxyAgent, readBody from twisted.web.http_headers import Headers from Robots.base import encode_url @@ -64,7 +66,13 @@ class robot_twisted(cf_multithread): def main_thread(self): """Main loop: create twisted agent and HTTP queries""" - agent = Agent(reactor, connectTimeout=self.timeout) + direct_agent = Agent(reactor, connectTimeout=self.timeout) + + if self.proxy and self.proxy.startswith('http'): + proxy = urlsplit(self.proxy) + endpoint = TCP4ClientEndpoint( + reactor, proxy.hostname, proxy.port, timeout=self.timeout) + proxy_agent = ProxyAgent(endpoint) while True: if self.queue.empty(): @@ -81,6 +89,10 @@ class robot_twisted(cf_multithread): except UnicodeEncodeError: url = encode_url(url) req_headers = {k: [v] for k, v in req_headers.items()} + if use_proxy: + agent = proxy_agent + else: + agent = direct_agent try: d = agent.request(b'GET', url.encode('ascii'), Headers(req_headers)) diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE index f2483d1..c0fa340 100644 --- a/doc/ANNOUNCE +++ b/doc/ANNOUNCE @@ -11,8 +11,11 @@ Version 6.2.0 (2024-??-??) Robot based on twisted and concurrent.futures, processes multiple URLs in parallel (multithreaded). + Doesn't properly support proxies; has problems with HTTP proxy + and doesn't support SOCKS5 proxy at all. + Doesn't query FTP; requires more work. - Default list of robots is now multirequests,aio,twisted. + Default list of robots is still multirequests,aio. WHERE TO GET diff --git a/doc/ChangeLog b/doc/ChangeLog index 752349c..9182c44 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -2,8 +2,11 @@ Version 6.2.0 (2024-??-??) Robot based on twisted and concurrent.futures, processes multiple URLs in parallel (multithreaded). + Doesn't properly support proxies; has problems with HTTP proxy + and doesn't support SOCKS5 proxy at all. + Doesn't query FTP; requires more work. - Default list of robots is now multirequests,aio,twisted. + Default list of robots is still multirequests,aio. Version 6.1.0 (2024-09-08) diff --git a/robots.py b/robots.py index 24bed78..2b80404 100644 --- a/robots.py +++ b/robots.py @@ -15,7 +15,7 @@ from os import environ from bkmk_objects import parse_params, set_params robot_names, robot_params = parse_params( - environ.get("BKMK_ROBOT", "multirequests,aio,twisted")) + environ.get("BKMK_ROBOT", "multirequests,aio")) def import_robot(robot_name): -- 2.39.5