"""Simple, straightforward robot based on urllib

This file is a part of Bookmarks database and Internet robot.
7 __author__ = "Oleg Broytman <phd@phdru.name>"
8 __copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
9 __license__ = "GNU GPL"
11 __all__ = ['robot_urllib_py3']
import http.client
import socket
import sys
import urllib.request

from Robots.bkmk_robot_base import robot_base, get_error
class RedirectException(Exception):
    """Signal an HTTP redirect: carries the status code and target URL.

    Raised by MyURLopener's 3xx handlers and caught in
    robot_urllib_py3.get(), which reads both attributes.
    """

    def __init__(self, errcode, newurl):
        Exception.__init__(self)
        self.errcode = errcode  # numeric HTTP status (301, 302, ...)
        # Bug fix: newurl was dropped on the floor, but the redirect
        # handler in robot_urllib_py3.get() reads e.newurl.
        self.newurl = newurl
class MyURLopener(urllib.request.URLopener):
    """URLopener that raises instead of silently following/serving errors.

    Redirects (301/302/303/307/308) raise RedirectException so the
    caller can record the new URL; HTTP errors raise IOError with a
    descriptive tuple ('http error', code, message, headers).
    """

    # Error 301 -- relocated (permanently)
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            # Bug fix: with neither header present newurl was unbound and
            # the raise below died with NameError; report an empty target
            # instead so the caller still sees the redirect status.
            newurl = ''
        raise RedirectException(errcode, newurl)

    # Error 302 -- relocated (temporarily)
    http_error_302 = http_error_301
    # Error 303 -- relocated (see other)
    http_error_303 = http_error_301
    # Error 307 -- relocated (temporarily)
    http_error_307 = http_error_301
    # Error 308 -- relocated (permanently)
    http_error_308 = http_error_301

    # Error 401 -- authentication required
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        # Bug fix: the error tuple was built as a bare expression and
        # never raised, so 401 responses went unreported.
        raise IOError(
            ('http error', errcode, "Authentication required ", headers))

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Close the response body (if any) so the connection/file object
        # is not leaked when we abort with an exception.
        if fp:
            fp.close()
        raise IOError(('http error', errcode, errmsg, headers))

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # URLopener.open() takes no timeout; accept one for interface
        # compatibility with urlopen()-style callers and ignore it.
        return urllib.request.URLopener.open(self, fullurl, data)
# Install our opener both as the module default and under a local name.
urllib.request._opener = opener = MyURLopener()

# Fake headers to pretend this is a real browser.
# Bug fix: the second literal was a separate bare-string statement, so
# implicit concatenation never happened and the " Gecko/..." half of the
# User-Agent was silently dropped; parenthesize to join the literals.
_user_agent = ("Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
               " Gecko/20001221 Firefox/2.0.0")
opener.addheaders[0] = ('User-Agent', _user_agent)

_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
    sys.version_info[0], sys.version_info[1],
    sys.version_info[2], urllib.request.__version__
)
opener.addheader('X-User-Agent', _x_user_agent)
opener.addheader('Referer', '')
opener.addheader('Accept', '*/*')
opener.addheader('Accept-Language', 'ru,en')
opener.addheader('Cache-Control', 'max-age=300')
opener.addheader('Connection', 'close')
# Keep a reference to the stock class so we can delegate to it.
urllib_ftpwrapper = urllib.request.ftpwrapper

# Cache key of the most recently created FTP connection in
# opener.ftpcache; read by robot_urllib_py3.get_ftp_welcome().
ftpcache_key = None


class myftpwrapper(urllib_ftpwrapper):
    """ftpwrapper that records the cache key of the last FTP connection."""

    def __init__(self, user, passwd, host, port, dirs):
        # Bug fix: without the global declaration this assignment made a
        # dead local, and get_ftp_welcome() saw a stale/undefined key.
        global ftpcache_key
        urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
        ftpcache_key = (user, host, port, '/'.join(dirs))


# Monkey-patch urllib so urlretrieve() uses our recording wrapper.
urllib.request.ftpwrapper = myftpwrapper
class robot_urllib_py3(robot_base):
    """Robot that fetches URLs with urllib.request via the patched opener."""

    def get(self, bookmark, url, accept_charset=False):
        """Fetch url for bookmark.

        Returns a 5-tuple (error, redirect_code, redirect_url, headers,
        content); exactly one "branch" of the tuple is populated.
        """
        try:
            # Set fake referer to the base URL
            opener.addheaders[2] = ('Referer', url)

            if accept_charset and bookmark.charset:
                opener.addheader('Accept-Charset', bookmark.charset)
            try:
                fname, headers = urllib.request.urlretrieve(url)
            finally:
                if accept_charset and bookmark.charset:
                    # Remove Accept-Charset even if urlretrieve raised,
                    # so the header does not leak into the next request.
                    del opener.addheaders[-1]

            # Candidate encodings, most specific first, de-duplicated.
            # NOTE(review): not consumed in the visible success path —
            # presumably used by decoding code elsewhere; confirm before
            # removing.
            possible_encodings = []
            for encoding in (
                bookmark.charset,
                sys.getfilesystemencoding(),
                'utf-8',
            ):
                if encoding and encoding not in possible_encodings:
                    possible_encodings.append(encoding)

            try:
                # 'with' guarantees the temp file is closed on all paths
                # (the original leaked it when read() raised).
                with open(fname, 'rb') as infile:
                    content = infile.read()
            except Exception as e:
                error = get_error(e)
                self.log(' Error: %s' % error)
                return (
                    error,
                    None, None, None, None
                )
            return None, None, None, headers, content

        except RedirectException as e:
            return None, e.errcode, e.newurl, None, None

        except (OSError, http.client.IncompleteRead) as e:
            # MyURLopener raises IOError(('http error', code, msg, hdrs));
            # the single tuple argument lands in e.args[0].  Exceptions
            # are not subscriptable in Python 3, so the old py2-style
            # e[0]/e[1] checks raised TypeError — inspect e.args instead.
            if e.args and isinstance(e.args[0], tuple):
                args = e.args[0]
            else:
                args = e.args
            if len(args) >= 2 and (args[0] == "http error") \
                    and (args[1] == -1):
                # No headers at all: record it as a non-error.
                # Bug fix: the message's second half was an orphaned bare
                # string statement; join the literals into one message.
                error = None
                bookmark.no_error = (
                    "The server did not return any header - "
                    "it is not an error, actually")
                self.log(' no headers: %s' % bookmark.no_error)
            else:
                error = get_error(e)
                self.log(' Error: %s' % error)
            return error, None, None, None, None

    def get_ftp_welcome(self):
        """Return the FTP server's welcome message for the last ftp URL."""
        global ftpcache_key
        _welcome = opener.ftpcache[ftpcache_key].ftp.welcome
        # I am assuming there are no duplicate ftp URLs in db.
        # If there are - ftpcache_key in next line is invalid.
        ftpcache_key = None
        return _welcome

    def finish_check_url(self, bookmark):
        """Finish checking: delegate to the base, then clean urllib state."""
        robot_base.finish_check_url(self, bookmark)
        # Drop temporary files created by urlretrieve(); urlcleanup()
        # also resets the module-level opener, so reinstall ours.
        urllib.request.urlcleanup()
        urllib.request._opener = opener