X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_rurllib.py;h=057c018e510c9fc6ff4de4e992e6cbd765a0ce63;hb=8e4df3830e6f68b691cb833415d30ab5d37338c9;hp=20b74239b8fb55fc184b08e2cf686ce8ca150983;hpb=cb9c36b39ed72cd1fa272130d2bcf162a89c3013;p=bookmarks_db.git

diff --git a/Robots/bkmk_rurllib.py b/Robots/bkmk_rurllib.py
index 20b7423..057c018 100644
--- a/Robots/bkmk_rurllib.py
+++ b/Robots/bkmk_rurllib.py
@@ -5,18 +5,15 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['robot_urllib']
 
 
-import os
-import sys
-import time
 import urllib
 
-from Robots.bkmk_robot_base import robot_base, get_error
+from Robots.bkmk_robot_base import robot_base, request_headers, get_error
 
 
 class RedirectException(Exception):
@@ -29,9 +26,9 @@ class RedirectException(Exception):
 class MyURLopener(urllib.URLopener):
     # Error 301 -- relocated (permanently)
     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
-        if headers.has_key('location'):
+        if 'location' in headers:
             newurl = headers['location']
-        elif headers.has_key('uri'):
+        elif 'uri' in headers:
             newurl = headers['uri']
         else:
             newurl = "Nowhere"
@@ -43,6 +40,8 @@ class MyURLopener(urllib.URLopener):
     http_error_303 = http_error_301
     # Error 307 -- relocated (temporarily)
     http_error_307 = http_error_301
+    # Error 308 -- relocated (permanently)
+    http_error_308 = http_error_301
 
     # Error 401 -- authentication required
     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
@@ -51,29 +50,24 @@ class MyURLopener(urllib.URLopener):
 
     def http_error_default(self, url, fp, errcode, errmsg, headers):
         if fp:
-            void = fp.read()
+            fp.read()
             fp.close()
         raise IOError(('http error', errcode, errmsg, headers))
 
 
-urllib._urlopener = MyURLopener()
+def add_headers(opener):
+    try:
+        _user_agent = request_headers.pop('User-Agent')
+    except KeyError:
+        pass
+    else:
+        opener.addheaders[0] = ('User-Agent', _user_agent)
+    for h, v in request_headers.items():
+        opener.addheader(h, v)
 
-# Fake headers to pretend this is a real browser
-_user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
-" Gecko/20001221 Firefox/2.0.0"
-urllib._urlopener.addheaders[0] = ('User-Agent', _user_agent)
-_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
-    sys.version_info[0], sys.version_info[1],
-    sys.version_info[2], urllib.__version__
-)
-urllib._urlopener.addheader('X-User-Agent', _x_user_agent)
-urllib._urlopener.addheader('Referer', '')
-
-urllib._urlopener.addheader('Accept', '*/*')
-urllib._urlopener.addheader('Accept-Language', 'ru,en')
-urllib._urlopener.addheader('Cache-Control', 'max-age=300')
-urllib._urlopener.addheader('Connection', 'close')
+
+urllib._urlopener = opener = MyURLopener()
+add_headers(opener)
 
 urllib_ftpwrapper = urllib.ftpwrapper
 ftpcache_key = None
@@ -93,18 +87,18 @@ class robot_urllib(robot_base):
     def get(self, bookmark, url, accept_charset=False):
         try:
             # Set fake referer to the base URL
-            urllib._urlopener.addheaders[2] = ('Referer', url)
+            opener.addheaders[2] = ('Referer', url)
 
             if accept_charset and bookmark.charset:
-                urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
+                opener.addheader('Accept-Charset', bookmark.charset)
             try:
                 fname, headers = urllib.urlretrieve(url)
             finally:
                 if accept_charset and bookmark.charset:
                     # Remove Accept-Charset
-                    del urllib._urlopener.addheaders[-1]
+                    del opener.addheaders[-1]
 
-            infile = open(fname, 'rb')
+            infile = open(fname, 'rt')
             content = infile.read()
             infile.close()
 
@@ -117,7 +111,7 @@ class robot_urllib(robot_base):
             if (e[0] == "http error") and (e[1] == -1):
                 error = None
                 bookmark.no_error = "The server did not return any header - "
-                "it is not an error, actually"
+                    "it is not an error, actually"
                 self.log(' no headers: %s' % bookmark.no_error)
             else:
                 error = get_error(e)
@@ -127,7 +121,7 @@ class robot_urllib(robot_base):
 
     def get_ftp_welcome(self):
         global ftpcache_key
-        _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
+        _welcome = opener.ftpcache[ftpcache_key].ftp.welcome
         # I am assuming there are no duplicate ftp URLs in db.
         # If there are - ftpcache_key in next line is invalid.
         ftpcache_key = None
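The main change above is that the hard-coded fake-browser headers are gone: the opener is now populated from a request_headers mapping imported from Robots.bkmk_robot_base and applied by the new add_headers() helper. A minimal standalone sketch of that wiring follows, assuming request_headers is an ordinary dict of header names to values; the header values shown here are placeholders, not the project's real defaults.

    # Python 2 sketch of the new header plumbing; in the real code
    # request_headers comes from Robots.bkmk_robot_base.
    import urllib

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux) Example/1.0',  # placeholder value
        'Accept': '*/*',
        'Accept-Language': 'ru,en',
    }

    def add_headers(opener):
        # urllib.URLopener pre-fills addheaders[0] with a default
        # Python-urllib User-Agent, so that slot is replaced, not appended to.
        try:
            _user_agent = request_headers.pop('User-Agent')
        except KeyError:
            pass
        else:
            opener.addheaders[0] = ('User-Agent', _user_agent)
        # Every other header is simply appended to the list.
        for h, v in request_headers.items():
            opener.addheader(h, v)

    opener = urllib.URLopener()
    add_headers(opener)
    # opener.addheaders now starts with the custom User-Agent, followed by
    # the remaining headers in whatever order the dict yields them.

Note that robot_urllib.get() later overwrites opener.addheaders[2] with a per-request Referer, which presumes the list has at least three entries after add_headers() runs; with a plain dict the position of any particular header is not guaranteed, so that is worth keeping in mind when changing request_headers.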