# Extracted from a git patch:
#   From: Oleg Broytman
#   Date: Sun, 12 Nov 2023 10:01:29 +0000 (+0300)
#   Subject: Fix(Py3): Fix `urllib`-based robot
#   X-Git-Tag: 5.0.0~56
#   X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=68438d614389f7011d78ed6b29d3a2ab13471c8b;p=bookmarks_db.git
#   New file: Robots/bkmk_rurllib_py3.py (index 0000000..b392887)
"""Simple, straightforward robot based on urllib

This file is a part of Bookmarks database and Internet robot.

"""

__author__ = "Oleg Broytman "
__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
__license__ = "GNU GPL"

__all__ = ['robot_urllib_py3']


import sys
import urllib.request

from Robots.bkmk_robot_base import robot_base, get_error


class RedirectException(Exception):
    """Raised by the opener instead of following an HTTP redirect.

    Attributes:
        errcode: the HTTP status code of the redirect response.
        newurl: the redirect target, or "Nowhere" if none was given.
    """

    def __init__(self, errcode, newurl):
        Exception.__init__(self)
        self.errcode = errcode
        self.newurl = newurl


class MyURLopener(urllib.request.URLopener):
    """URL opener that reports redirects and HTTP errors as exceptions."""

    # Error 301 -- relocated (permanently)
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Raise RedirectException carrying the status and the new URL."""
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            newurl = "Nowhere"
        raise RedirectException(errcode, newurl)

    # Error 302 -- relocated (temporarily)
    http_error_302 = http_error_301
    # Error 303 -- relocated (see other)
    http_error_303 = http_error_301
    # Error 307 -- relocated (temporarily)
    http_error_307 = http_error_301

    # Error 401 -- authentication required
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Raise IOError instead of prompting for credentials."""
        raise IOError(
            ('http error', errcode, "Authentication required ", headers))

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Drain and close the response, then raise IOError."""
        if fp:
            fp.read()
            fp.close()
        raise IOError(('http error', errcode, errmsg, headers))


def _set_header(name, value):
    """Replace header `name` on the shared opener, or append it.

    Headers are matched by name (case-insensitively) so the result does
    not depend on how many default headers the opener starts with.
    """
    headers = _opener.addheaders
    wanted = name.lower()
    for index, (key, _old_value) in enumerate(headers):
        if key.lower() == wanted:
            headers[index] = (name, value)
            return
    headers.append((name, value))


def _io_error_details(exc):
    """Return the detail tuple carried by an IOError.

    MyURLopener raises IOError with a single tuple argument, so
    `exc.args` is a 1-tuple wrapping the details; other code may pass
    the details as separate arguments.  Both shapes are normalized to
    a flat tuple.
    """
    details = exc.args
    if len(details) == 1 and isinstance(details[0], tuple):
        details = details[0]
    return details


# Keep our own reference to the opener: urllib.request may reset its
# global `_opener` (see finish_check_url below).
_opener = MyURLopener()
urllib.request._opener = _opener

# Fake headers to pretend this is a real browser
_user_agent = (
    "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
    " Gecko/20001221 Firefox/2.0.0"
)
_set_header('User-Agent', _user_agent)
_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
    sys.version_info[0], sys.version_info[1],
    sys.version_info[2], urllib.request.__version__
)
_opener.addheader('X-User-Agent', _x_user_agent)
_opener.addheader('Referer', '')

_opener.addheader('Accept', '*/*')
_opener.addheader('Accept-Language', 'ru,en')
_opener.addheader('Cache-Control', 'max-age=300')
_opener.addheader('Connection', 'close')


urllib_ftpwrapper = urllib.request.ftpwrapper
ftpcache_key = None


class myftpwrapper(urllib_ftpwrapper):
    """ftpwrapper that remembers the cache key of the last connection."""

    def __init__(self, user, passwd, host, port, dirs, *args, **kwargs):
        # Extra arguments (e.g. timeout, persistent) are passed through
        # untouched so every stdlib caller of ftpwrapper keeps working.
        urllib_ftpwrapper.__init__(
            self, user, passwd, host, port, dirs, *args, **kwargs)
        global ftpcache_key
        ftpcache_key = (user, host, port, '/'.join(dirs))


urllib.request.ftpwrapper = myftpwrapper


class robot_urllib_py3(robot_base):
    """Robot that fetches bookmarks with the legacy urllib opener."""

    def get(self, bookmark, url, accept_charset=False):
        """Download `url` and return its headers and content.

        Returns a 5-tuple
        ``(error, redirect_code, redirect_to, headers, content)``:

        * success: ``(None, None, None, headers, content)``;
        * redirect: ``(None, errcode, newurl, None, None)``;
        * I/O error: ``(error, None, None, None, None)`` where `error`
          is None when the server returned no headers at all.
        """
        try:
            # Set fake referer to the base URL
            _set_header('Referer', url)

            send_charset = bool(accept_charset and bookmark.charset)
            if send_charset:
                _opener.addheader('Accept-Charset', bookmark.charset)
            try:
                # Use the opener's own retrieve(): the module-level
                # urlretrieve() goes through urlopen(), which calls
                # opener.open(url, data, timeout) -- a signature the
                # legacy URLopener.open() does not accept.
                fname, headers = _opener.retrieve(url)
            finally:
                if send_charset:
                    # Remove Accept-Charset (it was appended last)
                    del _opener.addheaders[-1]

            with open(fname, 'rt') as infile:
                content = infile.read()

            return None, None, None, headers, content

        except RedirectException as e:
            return None, e.errcode, e.newurl, None, None

        except IOError as e:
            # Exceptions are not subscriptable in Python 3; inspect args.
            if _io_error_details(e)[:2] == ("http error", -1):
                error = None
                bookmark.no_error = (
                    "The server did not return any header - "
                    "it is not an error, actually"
                )
                self.log(' no headers: %s' % bookmark.no_error)
            else:
                error = get_error(e)
                self.log(' Error: %s' % error)

            return error, None, None, None, None

    def get_ftp_welcome(self):
        """Return the welcome message of the most recent FTP connection."""
        global ftpcache_key
        _welcome = _opener.ftpcache[ftpcache_key].ftp.welcome
        # I am assuming there are no duplicate ftp URLs in db.
        # If there are - ftpcache_key in next line is invalid.
        ftpcache_key = None
        return _welcome

    def finish_check_url(self, bookmark):
        """Finish the check and remove temporary download files."""
        robot_base.finish_check_url(self, bookmark)
        # Clean up through our opener: it owns the temporary files made
        # by retrieve(), and urllib.request.urlcleanup() would also
        # reset urllib.request._opener.
        _opener.cleanup()


# The same commit also changed robots.py (index 8fac908..1571d58): it
# added `import sys` and, at the top of import_robot(robot_name):
#
#     if (robot_name == 'urllib') and (sys.version_info[0] >= 3):
#         robot_name = 'urllib_py3'
#
# so that the 'urllib' robot resolves to this module under Python 3.