class robot_multirequests(robot_requests):
concurrent_class = concurrent.futures.ProcessPoolExecutor # or ThreadPoolExecutor # noqa: E501 line too long
# We're I/O-bound, not CPU-bound
- max_workers = 2*cpu_count if cpu_count else 10
+ max_urls = 2*cpu_count if cpu_count else 10
def __init__(self, *args, **kw):
- if isinstance(self.max_workers, str):
- self.max_workers = int(self.max_workers)
+ if isinstance(self.max_urls, str):
+ self.max_urls = int(self.max_urls)
concurrent_class = getattr(concurrent.futures, self.concurrent_class) \
if isinstance(self.concurrent_class, str) \
else self.concurrent_class
self.concurrent_class_name = concurrent_class.__name__
robot_requests.__init__(self, *args, **kw)
- self.executor = concurrent_class(max_workers=self.max_workers)
+ self.executor = concurrent_class(max_workers=self.max_urls)
# Bookmarks waiting to be processed;
# maps {URL: [bookmark, saved parent, future]}
if href in bookmarks:
return
bookmarks[href] = [bookmark, None, None]
- if len(bookmarks) < self.max_workers:
+ if len(bookmarks) < self.max_urls:
return
self.wait()
bookmarks = self.bookmarks
pending = self.pending
- free_workers = self.max_workers - len(pending)
+ free_workers = self.max_urls - len(pending)
if bookmarks and (free_workers > 0): # there are jobs and free workers
for href in bookmarks:
bookmark, parent, ft = bookmarks[href]
done, pending = concurrent.futures.wait(
pending, self.timeout+1,
return_when=concurrent.futures.FIRST_COMPLETED)
+ self.pending = pending
for ft in done:
new_bkmk, log_lines = ft.result()
for line in log_lines:
log(line)
- self.pending = pending
-
def stop(self):
while self.bookmarks or self.pending:
self.wait()
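
For reference, here is a standalone sketch (not part of the patch) of the concurrent.futures pattern the robot relies on: submit one job per URL, wait with FIRST_COMPLETED so results are handled as soon as any single download finishes, and carry the still-pending futures over to the next round. The fetch() helper and the example URLs are illustrative only; the real robot does far more work per bookmark.

    import concurrent.futures
    import urllib.request


    def fetch(url, timeout=10):
        # Illustrative download only; errors propagate via ft.result().
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return url, resp.status


    executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
    pending = {executor.submit(fetch, url)
               for url in ("https://example.com/", "https://example.org/")}

    while pending:
        # Wake up as soon as at least one future completes
        # (or after the timeout, with an empty `done` set).
        done, pending = concurrent.futures.wait(
            pending, timeout=11,
            return_when=concurrent.futures.FIRST_COMPLETED)
        for ft in done:
            url, status = ft.result()
            print(url, status)

    executor.shutdown()
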
WHAT'S NEW
-Version 5.7.0 (2024-08-16)
+Version 6.0.0 (2024-??-??)
- Robot bkmk_rrequests: Use ftplib directly, without requests_ftp.
-
- Robots: Removed connect_timeout, added ftp_timeout.
-
- Robot bkmk_raiohttp: Use aiohttp-socks for aiohttp, siosocks for aioftp.
-
-Version 5.6.1 (2024-08-15)
-
- Minor fixes.
-
-Version 5.6.0 (2024-08-15)
-
- Robot based on requests and concurrent.futures,
- processes multiple URLs in parallel. Multiprocess variant works
- very well, multithreading not so good (too many sites report errors).
-
- Removed urllib-based robots.
-
- Dropped support for Python 2.
-
- Default list of robots is now curl,requests,aiohttp.
+ Renamed max_workers to max_urls.
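
 A minimal illustration (not part of the patch) of the renamed setting; the subclass name is hypothetical, and the string value relies on the str-to-int conversion performed in __init__ above:

    class my_robot(robot_multirequests):
        # Overrides the default of 2*cpu_count (or 10 when cpu_count is
        # unavailable); string values are converted to int in __init__.
        max_urls = "8"
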
WHERE TO GET