X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fbkmk_rurllib_py3.py;h=ae88b1956f54e5f894f9ae2c584258408e5cfa9b;hb=2b3829aef193cb1951989a8cf97a96dcbfc084a1;hp=b39288770fa2212fe0cbfbaba010505c436faa1d;hpb=68438d614389f7011d78ed6b29d3a2ab13471c8b;p=bookmarks_db.git diff --git a/Robots/bkmk_rurllib_py3.py b/Robots/bkmk_rurllib_py3.py index b392887..ae88b19 100644 --- a/Robots/bkmk_rurllib_py3.py +++ b/Robots/bkmk_rurllib_py3.py @@ -11,6 +11,8 @@ __license__ = "GNU GPL" __all__ = ['robot_urllib_py3'] +import http.client +import socket import sys import urllib.request @@ -41,6 +43,8 @@ class MyURLopener(urllib.request.URLopener): http_error_303 = http_error_301 # Error 307 -- relocated (temporarily) http_error_307 = http_error_301 + # Error 308 -- relocated (permanently) + http_error_308 = http_error_301 # Error 401 -- authentication required def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): @@ -53,24 +57,27 @@ class MyURLopener(urllib.request.URLopener): fp.close() raise IOError(('http error', errcode, errmsg, headers)) + def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): + return urllib.request.URLopener.open(self, fullurl, data) -urllib.request._opener = MyURLopener() + +urllib.request._opener = opener = MyURLopener() # Fake headers to pretend this is a real browser _user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)" " Gecko/20001221 Firefox/2.0.0" -urllib.request._opener.addheaders[0] = ('User-Agent', _user_agent) +opener.addheaders[0] = ('User-Agent', _user_agent) _x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % ( sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.request.__version__ ) -urllib.request._opener.addheader('X-User-Agent', _x_user_agent) -urllib.request._opener.addheader('Referer', '') +opener.addheader('X-User-Agent', _x_user_agent) +opener.addheader('Referer', '') -urllib.request._opener.addheader('Accept', '*/*') -urllib.request._opener.addheader('Accept-Language', 'ru,en') -urllib.request._opener.addheader('Cache-Control', 'max-age=300') -urllib.request._opener.addheader('Connection', 'close') +opener.addheader('Accept', '*/*') +opener.addheader('Accept-Language', 'ru,en') +opener.addheader('Cache-Control', 'max-age=300') +opener.addheader('Connection', 'close') urllib_ftpwrapper = urllib.request.ftpwrapper @@ -91,26 +98,50 @@ class robot_urllib_py3(robot_base): def get(self, bookmark, url, accept_charset=False): try: # Set fake referer to the base URL - urllib.request._opener.addheaders[2] = ('Referer', url) + opener.addheaders[2] = ('Referer', url) if accept_charset and bookmark.charset: - urllib.request._opener.addheader('Accept-Charset', bookmark.charset) + opener.addheader('Accept-Charset', bookmark.charset) try: fname, headers = urllib.request.urlretrieve(url) finally: if accept_charset and bookmark.charset: # Remove Accept-Charset - del urllib.request._opener.addheaders[-1] - - infile = open(fname, 'rt') - content = infile.read() - infile.close() - + del opener.addheaders[-1] + + possible_encodings = [] + for encoding in ( + bookmark.charset, + sys.getfilesystemencoding(), + 'utf-8', + ): + if encoding and encoding not in possible_encodings: + possible_encodings.append(encoding) + content = e = None + infile = open(fname, 'rb') + try: + content = infile.read() + except Exception: + content = None + finally: + infile.close() + + if content is None: + e = str(e) + return ( + 'ERROR: ' + e, + None, None, None, None + ) return None, None, None, headers, content except RedirectException as e: return None, e.errcode, e.newurl, None, None + except (OSError, http.client.IncompleteRead) as e: + error = str(e) + self.log(' Error: %s' % error) + return error, None, None, None, None + except IOError as e: if (e[0] == "http error") and (e[1] == -1): error = None @@ -125,7 +156,7 @@ class robot_urllib_py3(robot_base): def get_ftp_welcome(self): global ftpcache_key - _welcome = urllib.request._opener.ftpcache[ftpcache_key].ftp.welcome + _welcome = opener.ftpcache[ftpcache_key].ftp.welcome # I am assuming there are no duplicate ftp URLs in db. # If there are - ftpcache_key in next line is invalid. ftpcache_key = None @@ -134,3 +165,4 @@ class robot_urllib_py3(robot_base): def finish_check_url(self, bookmark): robot_base.finish_check_url(self, bookmark) urllib.request.urlcleanup() + urllib.request._opener = opener