"""Simple, straightforward robot.

Fetches each bookmark sequentially with urllib; guaranteed to have
problems with timeouts :)

This file is a part of Bookmarks database and Internet robot.
"""
__version__ = "$Revision$"[11:-2]
__revision__ = "$Id$"[5:-2]
__date__ = "$Date$"[7:-2]
__author__ = "Oleg Broytman <phd@phdru.name>"
__copyright__ = "Copyright (C) 2000-2011 PhiloSoft Design"
__license__ = "GNU GPL"

__all__ = ['robot_simple', 'get_error']
+
+
+import sys, os
+import time, urllib
+from base64 import b64encode
+from urlparse import urljoin
+
+from m_lib.net.www.util import parse_time
+from m_lib.md5wrapper import md5wrapper
+
+from bkmk_objects import Robot
+from parse_html import parse_html
+
class RedirectException(Exception):
    """Raised by the URL opener to signal an HTTP (or <meta> refresh)
    relocation; the robot records it on the bookmark as "moved"."""

    # Maps redirect codes (301/302/307, plus the pseudo-code "html" for
    # <meta http-equiv="refresh">) to a short human-readable label.
    # NOTE(review): the entries appear elided in this diff chunk -- confirm
    # against the full file.
    reloc_dict = {
    }

    def __init__(self, errcode, newurl):
        # BUGFIX: use .get() with the raw errcode as fallback so that an
        # errcode missing from reloc_dict can never raise KeyError while
        # we are building the error message.
        Exception.__init__(self, "(%s) to %s" % (self.reloc_dict.get(errcode, errcode), newurl))
        self.url = newurl  # redirect target; used to follow icon redirects
class MyURLopener(urllib.URLopener):
    # Error 301 -- also relocated (permanently)
    # NOTE(review): http_error_302 is defined earlier in this class but is
    # elided from this diff chunk; 301 and 307 reuse its redirect handling
    # (presumably raising RedirectException) -- confirm against the full file.
    http_error_301 = http_error_302
    # Error 307 -- also relocated (temporary)
    http_error_307 = http_error_302

    # Error 401 -- authentication required
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        raise IOError, ('http error', errcode, "Authentication required ", headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Drain and close the response body so the connection is not leaked,
        # then surface the HTTP error to the caller as an IOError tuple.
        if fp:
            void = fp.read()
            fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)
+
# Install our opener so module-level urllib.urlretrieve() uses it.
urllib._urlopener = MyURLopener()

# Fake headers to pretend this is a real browser.
# The first header replaces urllib's default User-Agent in place.
_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
urllib._urlopener.addheaders[0] = ('User-Agent', _version)
_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
    sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
# Order matters below: Referer must land at addheaders[2], because
# check_url() overwrites that slot per request.
for _hdr_name, _hdr_value in (
        ('X-User-Agent', _version),
        ('Referer', ''),
        ('Connection', 'close'),
        ('Accept', '*/*'),
        ('Accept-Language', 'ru,en'),
        ('Cache-Control', 'max-age=300'),
):
    urllib._urlopener.addheader(_hdr_name, _hdr_value)
def get_error(msg):
    """Format an error object as a single line of text.

    A plain string is returned unchanged; any other iterable of error
    parts is rendered as "('part1' 'part2' ...)" with embedded newlines
    escaped as the two characters backslash-n.
    """
    if isinstance(msg, str):
        return msg
    quoted = ["'%s'" % str(part).replace('\n', "\\n") for part in msg]
    return "(%s)" % ' '.join(quoted)
urllib_ftpwrapper = urllib.ftpwrapper
def __init__(self, user, passwd, host, port, dirs):
urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
global ftpcache_key
- ftpcache_key = (user, host, port, string.join(dirs, '/'))
+ ftpcache_key = (user, host, port, '/'.join(dirs))
urllib.ftpwrapper = myftpwrapper
return _welcome
-from bkmk_objects import Robot
-from parse_html import parse_html
# Module-level icon cache shared by all checks in one run: maps an icon
# URL to a tuple (content type, data URI), or to None when fetching the
# icon failed / no icon exists, so failures are not retried per bookmark.
icons = {}
class robot_simple(Robot):
    """Simple, single-threaded robot.

    check_url() downloads a bookmark with urllib, records its size,
    last-modified time and MD5 checksum, extracts the real title and
    favicon from HTML pages, and reports relocations (HTTP redirects
    and <meta http-equiv="refresh">) via RedirectException.
    """

    def check_url(self, bookmark):
        # Returns 1 when the bookmark was tested (even if an error was
        # recorded on it), 0 only on keyboard interrupt.
        fname = None  # temp file from urlretrieve(); handed to finish_check_url()
        try:
            self.start = int(time.time())  # used for test_time in finish_check_url()
            bookmark.icon = None

            url_type, url_rest = urllib.splittype(bookmark.href)
            url_host, url_path = urllib.splithost(url_rest)
            url_path, url_tag = urllib.splittag(url_path)  # drop the #fragment

            # Set fake referer to the root of the site
            urllib._urlopener.addheaders[2] = ('Referer', "%s://%s%s" % (url_type, url_host, url_path))

            # Advertise the bookmark's stored charset for this one request
            # only; the header is removed again right after the fetch.
            if bookmark.charset: urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
            fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path))
            if bookmark.charset: del urllib._urlopener.addheaders[-1]

            size = 0
            last_modified = None

            if headers:
                try:
                    size = headers["Content-Length"]
                except KeyError:
                    pass

                try:
                    last_modified = headers["Last-Modified"]
                except KeyError:
                    pass

                if last_modified:
                    last_modified = parse_time(last_modified)

            if last_modified:
                last_modified = str(int(last_modified))
            else:
                # No usable Last-Modified header: fall back to last visit time.
                last_modified = bookmark.last_visit

            bookmark.size = size
            bookmark.last_modified = last_modified

            md5 = md5wrapper()
            if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
                md5.update(get_welcome())

            md5.md5file(fname)
            bookmark.md5 = str(md5)

            if headers:
                try:
                    content_type = headers["Content-Type"]
                    self.log(" Content-Type: %s" % content_type)
                    try:
                        # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
                        content_type, charset = content_type.split(';', 1)
                        content_type = content_type.strip()
                        charset = charset.split('=')[1].strip().split(',')[0]
                        self.log(" HTTP charset : %s" % charset)
                    except (ValueError, IndexError):
                        charset = None
                        self.log(" no charset in Content-Type header")
                    for ctype in ("text/html", "application/xhtml+xml"):
                        if content_type.startswith(ctype):
                            html = True
                            break
                    else:
                        html = False
                    if html:
                        parser = parse_html(fname, charset, self.log)
                        if parser:
                            bookmark.real_title = parser.title
                            icon = parser.icon
                        else:
                            icon = None
                        if not icon:
                            # No <link rel="icon"> found -- try the
                            # conventional location.
                            icon = "/favicon.ico"
                        icon = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
                        self.log(" looking for icon at: %s" % icon)
                        if icon in icons:
                            # This icon (or its absence) was already seen for
                            # another bookmark -- reuse the cached result.
                            if icons[icon]:
                                bookmark.icon_href = icon
                                content_type, bookmark.icon = icons[icon]
                                self.log(" cached icon: %s" % content_type)
                            else:
                                self.log(" cached icon: no icon")
                        else:
                            try:
                                _icon = icon
                                # Follow at most 8 redirects while fetching the icon.
                                for i in range(8):
                                    try:
                                        icon_fname, headers = urllib.urlretrieve(_icon)
                                    except RedirectException, e:
                                        _icon = e.url
                                        self.log(" redirect to : %s" % _icon)
                                    else:
                                        break
                                else:
                                    raise IOError("Too many redirects")
                            except:
                                # Any failure here just means "no icon":
                                # log it and cache the negative result.
                                etype, emsg, tb = sys.exc_info()
                                self.log(" no icon : %s %s" % (etype, emsg))
                                etype = emsg = tb = None  # break traceback reference cycle
                                icons[icon] = None
                            else:
                                content_type = headers["Content-Type"]
                                if content_type.startswith("application/") \
                                   or content_type.startswith("image/") \
                                   or content_type.startswith("text/plain"):
                                    icon_file = open(icon_fname, "rb")
                                    icon_data = icon_file.read()
                                    icon_file.close()
                                    bookmark.icon_href = icon
                                    self.log(" got icon : %s" % content_type)
                                    if content_type.startswith("application/") \
                                       or content_type.startswith("text/plain"):
                                        self.log(" non-image content type, assume x-icon")
                                        content_type = 'image/x-icon'
                                    # Store the icon inline as a data: URI.
                                    bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
                                    icons[icon] = (content_type, bookmark.icon)
                                else:
                                    self.log(" no icon : bad content type '%s'" % content_type)
                                    icons[icon] = None
                        if parser and parser.refresh:
                            # <meta http-equiv="refresh" content="SEC; url=URL">
                            refresh = parser.refresh
                            try:
                                url = refresh.split('=', 1)[1]
                            except IndexError:
                                url = "self"
                            try:
                                timeout = float(refresh.split(';')[0])
                            except (IndexError, ValueError):
                                raise RedirectException("html", "Bad redirect to %s (%s)" % (url, refresh))
                            else:
                                try:
                                    timeout = int(refresh.split(';')[0])
                                except ValueError:
                                    pass # float timeout
                                raise RedirectException("html", "%s (%s sec)" % (url, timeout))

                except KeyError, key:
                    self.log(" no header: %s" % key)

        except IOError, msg:
            if (msg[0] == "http error") and (msg[1] == -1):
                bookmark.no_error = "The server did not return any header - it is not an error, actually"
                self.log(' no headers: %s' % bookmark.no_error)
            else:
                bookmark.error = get_error(msg)
                self.log(' Error: %s' % bookmark.error)

        except EOFError:
            bookmark.error = "Unexpected EOF (FTP server closed connection)"
            self.log(' EOF: %s' % bookmark.error)

        except RedirectException, msg:
            bookmark.moved = str(msg)
            self.log(' Moved: %s' % bookmark.moved)

        except KeyboardInterrupt:
            self.log("Keyboard interrupt (^C)")
            return 0

        except:
            # Catch-all boundary: never let one bad bookmark kill the run.
            import traceback
            traceback.print_exc()
            bookmark.error = "Exception!"
            self.log(' Exception: %s' % bookmark.error)

        finally:
            # Always record size/test_time and clean up temporary files.
            self.finish_check_url(bookmark, fname)

        # Tested
        return 1
-
- def finish_check_url(self, bookmark):
+ def finish_check_url(self, bookmark, fname=None):
# Calculate these attributes even in case of an error
- if os.path.exists(self.tempfname):
- size = str(os.stat(self.tempfname).st_size)
+ if fname and os.path.exists(fname):
+ size = str(os.path.getsize(fname))
if size[-1] == 'L':
size = size[:-1]
bookmark.size = size
now = int(time.time())
bookmark.test_time = str(now - start)
+ urllib.urlcleanup()