Robots/bkmk_rsimple.py

   1 """
   2    Simple, straightforward robot
   3
   4    Written by Oleg Broytman. Copyright (C) 2000-2010 PhiloSoft Design.
   5 """
   6
   7 import sys, os
   8 import time, urllib
   9 from base64 import b64encode
  10 from urlparse import urljoin
  11
  12 from m_lib.net.www.util import parse_time
  13 from m_lib.md5wrapper import md5wrapper
  14
  15 from bkmk_objects import Robot
  16 from parse_html import parse_html
  17
  18
  19 class RedirectException(Exception):
  20    reloc_dict = {
  21       301: "perm.",
  22       302: "temp.",
  23       "html": "html"
  24    }
  25    def __init__(self, errcode, newurl):
  26       Exception.__init__(self, "(%s) to %s" % (self.reloc_dict[errcode], newurl))
  27       self.url = newurl
  28
  29
  30 class MyURLopener(urllib.URLopener):
  31    # Error 302 -- relocated (temporarily)
  32    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
  33       if headers.has_key('location'):
  34          newurl = headers['location']
  35       elif headers.has_key('uri'):
  36          newurl = headers['uri']
  37       else:
  38          newurl = "Nowhere"
  39       raise RedirectException(errcode, newurl)
  40
  41    # Error 301 -- also relocated (permanently)
  42    http_error_301 = http_error_302
  43    # Error 307 -- also relocated (temporary)
  44    http_error_307 = http_error_302
  45
  46    # Error 401 -- authentication required
  47    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
  48       raise IOError, ('http error', errcode, "Authentication required ", headers)
  49
  50    def http_error_default(self, url, fp, errcode, errmsg, headers):
  51       if fp:
  52          void = fp.read()
  53          fp.close()
  54       raise IOError, ('http error', errcode, errmsg, headers)
  55
  56
  57 urllib._urlopener = MyURLopener()
  58
  59 # Fake headers to pretend this is a real browser
  60 _version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
  61 urllib._urlopener.addheaders[0] = ('User-Agent', _version)
  62 _version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
  63    sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
  64 urllib._urlopener.addheader('X-User-Agent', _version)
  65 urllib._urlopener.addheader('Referer', '')
  66
  67 urllib._urlopener.addheader('Connection', 'close')
  68 urllib._urlopener.addheader('Accept', '*/*')
  69 urllib._urlopener.addheader('Accept-Language', 'ru,en')
  70 urllib._urlopener.addheader('Cache-Control', 'max-age=300')
  71
  72
  73 def get_error(msg):
  74    if isinstance(msg, str):
  75       return msg
  76
  77    else:
  78       s = []
  79       for i in msg:
  80          s.append("'%s'" % str(i).replace('\n', "\\n"))
  81       return "(%s)" % ' '.join(s)
  82
  83
  84 urllib_ftpwrapper = urllib.ftpwrapper
  85 ftpcache_key = None
  86
  87 class myftpwrapper(urllib_ftpwrapper):
  88    def __init__(self, user, passwd, host, port, dirs):
  89       urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
  90       global ftpcache_key
  91       ftpcache_key = (user, host, port, '/'.join(dirs))
  92
  93 urllib.ftpwrapper = myftpwrapper
  94
  95 def get_welcome():
  96    global ftpcache_key
  97    _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
  98    ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
  99                        # If there are - ftpcache_key in prev line is invalid.
 100    return _welcome
 101
 102
 103 icons = {} # Icon cache; maps URL to a tuple (content type, data)
 104            # or None if there is no icon.
 105
 106 class robot_simple(Robot):
 107    def check_url(self, bookmark):
 108       fname = None
 109       try:
 110          self.start = int(time.time())
 111          bookmark.icon = None
 112
 113          url_type, url_rest = urllib.splittype(bookmark.href)
 114          url_host, url_path = urllib.splithost(url_rest)
 115          url_path, url_tag  = urllib.splittag(url_path)
 116
 117          # Set fake referer to the root of the site
 118          urllib._urlopener.addheaders[2] = ('Referer', "%s://%s%s" % (url_type, url_host, url_path))
 119
 120          if bookmark.charset: urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
 121          fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path))
 122          if bookmark.charset: del urllib._urlopener.addheaders[-1]
 123
 124          size = 0
 125          last_modified = None
 126
 127          if headers:
 128             try:
 129                size = headers["Content-Length"]
 130             except KeyError:
 131                pass
 132
 133             try:
 134                last_modified = headers["Last-Modified"]
 135             except KeyError:
 136                pass
 137
 138             if last_modified:
 139                last_modified = parse_time(last_modified)
 140
 141          if last_modified:
 142             last_modified = str(int(last_modified))
 143          else:
 144             last_modified = bookmark.last_visit
 145
 146          bookmark.size = size
 147          bookmark.last_modified = last_modified
 148
 149          md5 = md5wrapper()
 150          if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
 151             md5.update(get_welcome())
 152
 153          md5.md5file(fname)
 154          bookmark.md5 = str(md5)
 155
 156          if headers:
 157             try:
 158                content_type = headers["Content-Type"]
 159                self.log("   Content-Type: %s" % content_type)
 160                try:
 161                   # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
 162                   content_type, charset = content_type.split(';', 1)
 163                   content_type = content_type.strip()
 164                   charset = charset.split('=')[1].strip().split(',')[0]
 165                   self.log("   HTTP charset   : %s" % charset)
 166                except (ValueError, IndexError):
 167                   charset = None
 168                   self.log("   no charset in Content-Type header")
 169                for ctype in ("text/html", "application/xhtml+xml"):
 170                   if content_type.startswith(ctype):
 171                       html = True
 172                       break
 173                else:
 174                   html = False
 175                if html:
 176                   parser = parse_html(fname, charset, self.log)
 177                   if parser:
 178                       bookmark.real_title = parser.title
 179                       icon = parser.icon
 180                   else:
 181                      icon = None
 182                   if not icon:
 183                      icon = "/favicon.ico"
 184                   icon = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
 185                   self.log("   looking for icon at: %s" % icon)
 186                   if icon in icons:
 187                      if icons[icon]:
 188                         bookmark.icon_href = icon
 189                         content_type, bookmark.icon = icons[icon]
 190                         self.log("       cached icon: %s" % content_type)
 191                      else:
 192                         self.log("       cached icon: no icon")
 193                   else:
 194                      try:
 195                         _icon = icon
 196                         for i in range(8):
 197                            try:
 198                               icon_fname, headers = urllib.urlretrieve(_icon)
 199                            except RedirectException, e:
 200                               _icon = e.url
 201                               self.log("       redirect to : %s" % _icon)
 202                            else:
 203                               break
 204                         else:
 205                            raise IOError("Too many redirects")
 206                      except:
 207                         etype, emsg, tb = sys.exc_info()
 208                         self.log("   no icon        : %s %s" % (etype, emsg))
 209                         etype = emsg = tb = None
 210                         icons[icon] = None
 211                      else:
 212                         content_type = headers["Content-Type"]
 213                         if content_type.startswith("application/") \
 214                               or content_type.startswith("image/") \
 215                               or content_type.startswith("text/plain"):
 216                            icon_file = open(icon_fname, "rb")
 217                            icon_data = icon_file.read()
 218                            icon_file.close()
 219                            bookmark.icon_href = icon
 220                            self.log("   got icon       : %s" % content_type)
 221                            if content_type.startswith("application/") \
 222                                  or content_type.startswith("text/plain"):
 223                               self.log("   got non-image icon, assume x-icon")
 224                               content_type = 'image/x-icon'
 225                            bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
 226                            icons[icon] = (content_type, bookmark.icon)
 227                         else:
 228                            self.log("   no icon        : bad content type '%s'" % content_type)
 229                            icons[icon] = None
 230                   if parser and parser.refresh:
 231                      refresh = parser.refresh
 232                      try:
 233                         url = refresh.split('=', 1)[1]
 234                      except IndexError:
 235                         url = "self"
 236                      try:
 237                         timeout = float(refresh.split(';')[0])
 238                      except (IndexError, ValueError):
 239                         raise RedirectException("html", "Bad redirect to %s (%s)" % (url, refresh))
 240                      else:
 241                         try:
 242                            timeout = int(refresh.split(';')[0])
 243                         except ValueError:
 244                            pass # float timeout
 245                         raise RedirectException("html", "%s (%s sec)" % (url, timeout))
 246
 247             except KeyError, key:
 248                self.log("   no header: %s" % key)
 249
 250       except IOError, msg:
 251          if (msg[0] == "http error") and (msg[1] == -1):
 252             bookmark.no_error = "The server did not return any header - it is not an error, actually"
 253             self.log('   no headers: %s' % bookmark.no_error)
 254          else:
 255             bookmark.error = get_error(msg)
 256             self.log('   Error: %s' % bookmark.error)
 257
 258       except EOFError:
 259          bookmark.error = "Unexpected EOF (FTP server closed connection)"
 260          self.log('   EOF: %s' % bookmark.error)
 261
 262       except RedirectException, msg:
 263          bookmark.moved = str(msg)
 264          self.log('   Moved: %s' % bookmark.moved)
 265
 266       except KeyboardInterrupt:
 267          self.log("Keyboard interrupt (^C)")
 268          return 0
 269
 270       except:
 271          import traceback
 272          traceback.print_exc()
 273          bookmark.error = "Exception!"
 274          self.log('   Exception: %s' % bookmark.error)
 275
 276       finally:
 277          self.finish_check_url(bookmark, fname)
 278
 279       # Tested
 280       return 1
 281
 282
 283    def finish_check_url(self, bookmark, fname=None):
 284       # Calculate these attributes even in case of an error
 285       if fname and os.path.exists(fname):
 286          size = str(os.path.getsize(fname))
 287          if size[-1] == 'L':
 288             size = size[:-1]
 289          bookmark.size = size
 290
 291       start = self.start
 292       bookmark.last_tested = str(start)
 293
 294       now = int(time.time())
 295       bookmark.test_time = str(now - start)
 296       urllib.urlcleanup()