From 7462ff4d330df6386557b5265e454ffa0e6b7bb9 Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Sun, 3 Mar 2024 12:48:11 +0300
Subject: [PATCH] Refactor(Robots): Refactor request headers

---
 Robots/bkmk_robot_base.py  | 16 +++++++++++++++
 Robots/bkmk_rurllib.py     | 40 ++++++++++++++++----------------------
 Robots/bkmk_rurllib2.py    | 20 ++++---------------
 Robots/bkmk_rurllib_py3.py | 26 ++++++++-----------------
 4 files changed, 45 insertions(+), 57 deletions(-)

diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py
index 1e511d0..df33a26 100644
--- a/Robots/bkmk_robot_base.py
+++ b/Robots/bkmk_robot_base.py
@@ -24,6 +24,22 @@ from bkmk_objects import Robot
 from parse_html import parse_html
 
 
+# Fake headers to pretend this is a real browser
+_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
+               " Gecko/20001221 Firefox/2.0.0")
+_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
+
+request_headers = {
+    'Accept': '*/*',
+    'Accept-Language': 'ru,en',
+    'Cache-Control': 'max-age=300',
+    'Connection': 'close',
+    'Referer': '/',
+    'User-Agent': _user_agent,
+    'X-User-Agent': _x_user_agent,
+}
+
+
 reloc_dict = {
     301: "perm1.",
     302: "temp2.",
diff --git a/Robots/bkmk_rurllib.py b/Robots/bkmk_rurllib.py
index 067799c..057c018 100644
--- a/Robots/bkmk_rurllib.py
+++ b/Robots/bkmk_rurllib.py
@@ -5,16 +5,15 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman "
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_urllib']
 
 
-import sys
 import urllib
 
-from Robots.bkmk_robot_base import robot_base, get_error
+from Robots.bkmk_robot_base import robot_base, request_headers, get_error
 
 
 class RedirectException(Exception):
@@ -56,24 +55,19 @@ class MyURLopener(urllib.URLopener):
         raise IOError(('http error', errcode, errmsg, headers))
 
 
-urllib._urlopener = MyURLopener()
+def add_headers(opener):
+    try:
+        _user_agent = request_headers.pop('User-Agent')
+    except KeyError:
+        pass
+    else:
+        opener.addheaders[0] = ('User-Agent', _user_agent)
+    for h, v in request_headers.items():
+        opener.addheader(h, v)
 
 
-# Fake headers to pretend this is a real browser
-_user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
-" Gecko/20001221 Firefox/2.0.0"
-urllib._urlopener.addheaders[0] = ('User-Agent', _user_agent)
-_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
-    sys.version_info[0], sys.version_info[1],
-    sys.version_info[2], urllib.__version__
-)
-urllib._urlopener.addheader('X-User-Agent', _x_user_agent)
-urllib._urlopener.addheader('Referer', '')
-
-urllib._urlopener.addheader('Accept', '*/*')
-urllib._urlopener.addheader('Accept-Language', 'ru,en')
-urllib._urlopener.addheader('Cache-Control', 'max-age=300')
-urllib._urlopener.addheader('Connection', 'close')
+urllib._urlopener = opener = MyURLopener()
+add_headers(opener)
 
 urllib_ftpwrapper = urllib.ftpwrapper
 ftpcache_key = None
@@ -93,16 +87,16 @@ class robot_urllib(robot_base):
     def get(self, bookmark, url, accept_charset=False):
         try:
             # Set fake referer to the base URL
-            urllib._urlopener.addheaders[2] = ('Referer', url)
+            opener.addheaders[2] = ('Referer', url)
 
             if accept_charset and bookmark.charset:
-                urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
+                opener.addheader('Accept-Charset', bookmark.charset)
             try:
                 fname, headers = urllib.urlretrieve(url)
             finally:
                 if accept_charset and bookmark.charset:
                     # Remove Accept-Charset
-                    del urllib._urlopener.addheaders[-1]
+                    del opener.addheaders[-1]
 
             infile = open(fname, 'rt')
             content = infile.read()
@@ -127,7 +121,7 @@ class robot_urllib(robot_base):
 
     def get_ftp_welcome(self):
         global ftpcache_key
-        _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
+        _welcome = opener.ftpcache[ftpcache_key].ftp.welcome
         # I am assuming there are no duplicate ftp URLs in db.
         # If there are - ftpcache_key in next line is invalid.
         ftpcache_key = None
diff --git a/Robots/bkmk_rurllib2.py b/Robots/bkmk_rurllib2.py
index 1233c70..d1b679c 100644
--- a/Robots/bkmk_rurllib2.py
+++ b/Robots/bkmk_rurllib2.py
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman "
-__copyright__ = "Copyright (C) 2014-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2014-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_urllib2']
@@ -14,7 +14,7 @@ __all__ = ['robot_urllib2']
 import sys
 import httplib
 import urllib2
-from Robots.bkmk_robot_base import robot_base, get_error
+from Robots.bkmk_robot_base import robot_base, request_headers, get_error
 
 
 _fw = None
@@ -39,25 +39,13 @@ for klass in default_classes:
 urllib2.install_opener(opener)
 
 
-# Fake headers to pretend this is a real browser
-_user_agent = "Mozilla/5.0 (X11; Linux i686; rv:30.0)"
-" Gecko/20100101 Firefox/30.0"
-_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib2)" % (
-    sys.version_info[0], sys.version_info[1], sys.version_info[2])
-
-
 class robot_urllib2(robot_base):
     def get(self, bookmark, url, accept_charset=False):
         request = urllib2.Request(url)
-        request.add_header('Accept', '*/*')
+        for h, v in request_headers.items():
+            request.add_header(h, v)
         if accept_charset and bookmark.charset:
             request.add_header('Accept-Charset', bookmark.charset)
-        request.add_header('Accept-Language', 'ru,en')
-        request.add_header('Cache-Control', 'max-age=300')
-        request.add_header('Connection', 'close')
-        request.add_header('Referer', url)
-        request.add_header('User-agent', _user_agent)
-        request.add_header('X-User-Agent', _x_user_agent)
 
         global _fw
         _fw = None
diff --git a/Robots/bkmk_rurllib_py3.py b/Robots/bkmk_rurllib_py3.py
index ae88b19..268dad4 100644
--- a/Robots/bkmk_rurllib_py3.py
+++ b/Robots/bkmk_rurllib_py3.py
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
""" __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['robot_urllib_py3'] @@ -14,10 +14,16 @@ __all__ = ['robot_urllib_py3'] import http.client import socket import sys +import urllib import urllib.request from Robots.bkmk_robot_base import robot_base, get_error +# Fake to import 'add_headers' +urllib.URLopener = urllib.request.URLopener +urllib.ftpwrapper = urllib.request.ftpwrapper +from Robots.bkmk_rurllib import add_headers # noqa: E402 import not at top + class RedirectException(Exception): def __init__(self, errcode, newurl): @@ -62,23 +68,7 @@ class MyURLopener(urllib.request.URLopener): urllib.request._opener = opener = MyURLopener() - -# Fake headers to pretend this is a real browser -_user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)" -" Gecko/20001221 Firefox/2.0.0" -opener.addheaders[0] = ('User-Agent', _user_agent) -_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % ( - sys.version_info[0], sys.version_info[1], - sys.version_info[2], urllib.request.__version__ -) -opener.addheader('X-User-Agent', _x_user_agent) -opener.addheader('Referer', '') - -opener.addheader('Accept', '*/*') -opener.addheader('Accept-Language', 'ru,en') -opener.addheader('Cache-Control', 'max-age=300') -opener.addheader('Connection', 'close') - +add_headers(opener) urllib_ftpwrapper = urllib.request.ftpwrapper ftpcache_key = None -- 2.39.2