Robots/bkmk_rurllib_py3.py

   1 """Simple, straightforward robot based on urllib
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['robot_urllib_py3']
  12
  13
  14 import http.client
  15 import socket
  16 import sys
  17 import urllib.request
  18
  19 from Robots.bkmk_robot_base import robot_base, get_error
  20
  21
  22 class RedirectException(Exception):
  23     def __init__(self, errcode, newurl):
  24         Exception.__init__(self)
  25         self.errcode = errcode
  26         self.newurl = newurl
  27
  28
  29 class MyURLopener(urllib.request.URLopener):
  30     # Error 301 -- relocated (permanently)
  31     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
  32         if 'location' in headers:
  33             newurl = headers['location']
  34         elif 'uri' in headers:
  35             newurl = headers['uri']
  36         else:
  37             newurl = "Nowhere"
  38         raise RedirectException(errcode, newurl)
  39
  40     # Error 302 -- relocated (temporarily)
  41     http_error_302 = http_error_301
  42     # Error 303 -- relocated (see other)
  43     http_error_303 = http_error_301
  44     # Error 307 -- relocated (temporarily)
  45     http_error_307 = http_error_301
  46
  47     # Error 401 -- authentication required
  48     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
  49         raise IOError(
  50             ('http error', errcode, "Authentication required ", headers))
  51
  52     def http_error_default(self, url, fp, errcode, errmsg, headers):
  53         if fp:
  54             fp.read()
  55             fp.close()
  56         raise IOError(('http error', errcode, errmsg, headers))
  57
  58     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
  59         return urllib.request.URLopener.open(self, fullurl, data)
  60
  61
  62 urllib.request._opener = opener = MyURLopener()
  63
  64 # Fake headers to pretend this is a real browser
  65 _user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
  66 " Gecko/20001221 Firefox/2.0.0"
  67 opener.addheaders[0] = ('User-Agent', _user_agent)
  68 _x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
  69    sys.version_info[0], sys.version_info[1],
  70    sys.version_info[2], urllib.request.__version__
  71 )
  72 opener.addheader('X-User-Agent', _x_user_agent)
  73 opener.addheader('Referer', '')
  74
  75 opener.addheader('Accept', '*/*')
  76 opener.addheader('Accept-Language', 'ru,en')
  77 opener.addheader('Cache-Control', 'max-age=300')
  78 opener.addheader('Connection', 'close')
  79
  80
  81 urllib_ftpwrapper = urllib.request.ftpwrapper
  82 ftpcache_key = None
  83
  84
  85 class myftpwrapper(urllib_ftpwrapper):
  86     def __init__(self, user, passwd, host, port, dirs):
  87         urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
  88         global ftpcache_key
  89         ftpcache_key = (user, host, port, '/'.join(dirs))
  90
  91
  92 urllib.request.ftpwrapper = myftpwrapper
  93
  94
  95 class robot_urllib_py3(robot_base):
  96     def get(self, bookmark, url, accept_charset=False):
  97         try:
  98             # Set fake referer to the base URL
  99             opener.addheaders[2] = ('Referer', url)
 100
 101             if accept_charset and bookmark.charset:
 102                 opener.addheader('Accept-Charset', bookmark.charset)
 103             try:
 104                 fname, headers = urllib.request.urlretrieve(url)
 105             finally:
 106                 if accept_charset and bookmark.charset:
 107                     # Remove Accept-Charset
 108                     del opener.addheaders[-1]
 109
 110             possible_encodings = []
 111             for encoding in (
 112                     bookmark.charset,
 113                     sys.getfilesystemencoding(),
 114                     'utf-8',
 115             ):
 116                 if encoding and encoding not in possible_encodings:
 117                     possible_encodings.append(encoding)
 118             content = None
 119             for encoding in possible_encodings:
 120                 infile = open(fname, 'rt', encoding=encoding)
 121                 try:
 122                     content = infile.read()
 123                 except UnicodeDecodeError:
 124                     infile.close()
 125                     continue
 126                 else:
 127                     break
 128                 infile.close()
 129
 130             if content is None:
 131                 return (
 132                     'ERROR: File encoding was not recognized',
 133                     None, None, None, None
 134                 )
 135             return None, None, None, headers, content
 136
 137         except RedirectException as e:
 138             return None, e.errcode, e.newurl, None, None
 139
 140         except (OSError, http.client.IncompleteRead) as e:
 141             error = str(e)
 142             self.log('   Error: %s' % error)
 143             return error, None, None, None, None
 144
 145         except IOError as e:
 146             if (e[0] == "http error") and (e[1] == -1):
 147                 error = None
 148                 bookmark.no_error = "The server did not return any header - "
 149                 "it is not an error, actually"
 150                 self.log('   no headers: %s' % bookmark.no_error)
 151             else:
 152                 error = get_error(e)
 153                 self.log('   Error: %s' % error)
 154
 155             return error, None, None, None, None
 156
 157     def get_ftp_welcome(self):
 158         global ftpcache_key
 159         _welcome = opener.ftpcache[ftpcache_key].ftp.welcome
 160         # I am assuming there are no duplicate ftp URLs in db.
 161         # If there are - ftpcache_key in next line is invalid.
 162         ftpcache_key = None
 163         return _welcome
 164
 165     def finish_check_url(self, bookmark):
 166         robot_base.finish_check_url(self, bookmark)
 167         urllib.request.urlcleanup()
 168         urllib.request._opener = opener