-"""Simple, strightforward robot
+"""Base class for robots
This file is a part of Bookmarks database and Internet robot.
__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
__license__ = "GNU GPL"
-__all__ = ['robot_simple', 'get_error']
+__all__ = ['robot_base', 'get_error']
-import sys, os
+import sys
import time, urllib
from base64 import b64encode
from urlparse import urljoin
from m_lib.md5wrapper import md5wrapper
from bkmk_objects import Robot
-from parse_html import parse_filename
+from parse_html import parse_html
class RedirectException(Exception):
self.url = newurl
-class MyURLopener(urllib.URLopener):
- # Error 302 -- relocated (temporarily)
- def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
- if headers.has_key('location'):
- newurl = headers['location']
- elif headers.has_key('uri'):
- newurl = headers['uri']
- else:
- newurl = "Nowhere"
- raise RedirectException(errcode, newurl)
-
- # Error 301 -- also relocated (permanently)
- http_error_301 = http_error_302
- # Error 307 -- also relocated (temporary)
- http_error_307 = http_error_302
-
- # Error 401 -- authentication required
- def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
- raise IOError, ('http error', errcode, "Authentication required ", headers)
-
- def http_error_default(self, url, fp, errcode, errmsg, headers):
- if fp:
- void = fp.read()
- fp.close()
- raise IOError, ('http error', errcode, errmsg, headers)
-
-
-urllib._urlopener = MyURLopener()
-
-# Fake headers to pretend this is a real browser
-_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
-urllib._urlopener.addheaders[0] = ('User-Agent', _version)
-_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
- sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
-urllib._urlopener.addheader('X-User-Agent', _version)
-urllib._urlopener.addheader('Referer', '')
-
-urllib._urlopener.addheader('Connection', 'close')
-urllib._urlopener.addheader('Accept', '*/*')
-urllib._urlopener.addheader('Accept-Language', 'ru,en')
-urllib._urlopener.addheader('Cache-Control', 'max-age=300')
-
-
def get_error(msg):
if isinstance(msg, str):
return msg
return "(%s)" % ' '.join(s)
-urllib_ftpwrapper = urllib.ftpwrapper
-ftpcache_key = None
-
-class myftpwrapper(urllib_ftpwrapper):
- def __init__(self, user, passwd, host, port, dirs):
- urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
- global ftpcache_key
- ftpcache_key = (user, host, port, '/'.join(dirs))
-
-urllib.ftpwrapper = myftpwrapper
-
-def get_welcome():
- global ftpcache_key
- _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
- ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
- # If there are - ftpcache_key in prev line is invalid.
- return _welcome
-
-
icons = {} # Icon cache; maps URL to a tuple (content type, data)
# or None if there is no icon.
-class robot_simple(Robot):
+class robot_base(Robot):
def check_url(self, bookmark):
- fname = None
try:
self.start = int(time.time())
bookmark.icon = None
url_host, url_path = urllib.splithost(url_rest)
url_path, url_tag = urllib.splittag(url_path)
- # Set fake referer to the root of the site
- urllib._urlopener.addheaders[2] = ('Referer', "%s://%s%s" % (url_type, url_host, url_path))
+ url = "%s://%s%s" % (url_type, url_host, url_path)
+ headers, content = self.urlretrieve(bookmark, url, True)
- if bookmark.charset: urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
- fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path))
- if bookmark.charset: del urllib._urlopener.addheaders[-1]
+ if content is None:
+ return 1
size = 0
last_modified = None
try:
size = headers["Content-Length"]
except KeyError:
- pass
+ size = len(content)
try:
last_modified = headers["Last-Modified"]
if last_modified:
last_modified = parse_time(last_modified)
+ else:
+ size = len(content)
if last_modified:
last_modified = str(int(last_modified))
md5 = md5wrapper()
if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
- md5.update(get_welcome())
+ md5.update(self.get_ftp_welcome())
- md5.md5file(fname)
+ md5.update(content)
bookmark.md5 = str(md5)
if headers:
else:
html = False
if html:
- parser = parse_filename(fname, charset, self.log)
+ parser = parse_html(content, charset, self.log)
if parser:
bookmark.real_title = parser.title
icon = parser.icon
icon = None
if not icon:
icon = "/favicon.ico"
- icon = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
- self.log(" looking for icon at: %s" % icon)
- if icon in icons:
- if icons[icon]:
- bookmark.icon_href = icon
- content_type, bookmark.icon = icons[icon]
+ icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
+ self.log(" looking for icon at: %s" % icon_url)
+ if icon_url in icons:
+ if icons[icon_url]:
+ bookmark.icon_href = icon_url
+ content_type, bookmark.icon = icons[icon_url]
self.log(" cached icon: %s" % content_type)
else:
self.log(" cached icon: no icon")
else:
try:
- _icon = icon
+ _icon_url = icon_url
for i in range(8):
try:
- icon_fname, headers = urllib.urlretrieve(_icon)
+ icon_headers, icon_data = self.urlretrieve(bookmark, _icon_url)
except RedirectException, e:
- _icon = e.url
- self.log(" redirect to : %s" % _icon)
+ _icon_url = e.url
+ self.log(" redirect to : %s" % _icon_url)
else:
+ if icon_data is None:
+ raise IOError("No icon")
break
else:
raise IOError("Too many redirects")
etype, emsg, tb = sys.exc_info()
self.log(" no icon : %s %s" % (etype, emsg))
etype = emsg = tb = None
- icons[icon] = None
+ icons[icon_url] = None
else:
- content_type = headers["Content-Type"]
+ content_type = icon_headers["Content-Type"]
if content_type.startswith("application/") \
or content_type.startswith("image/") \
or content_type.startswith("text/plain"):
- icon_file = open(icon_fname, "rb")
- icon_data = icon_file.read()
- icon_file.close()
- bookmark.icon_href = icon
+ bookmark.icon_href = icon_url
self.log(" got icon : %s" % content_type)
if content_type.startswith("application/") \
or content_type.startswith("text/plain"):
self.log(" non-image content type, assume x-icon")
content_type = 'image/x-icon'
bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
- icons[icon] = (content_type, bookmark.icon)
+ icons[icon_url] = (content_type, bookmark.icon)
else:
self.log(" no icon : bad content type '%s'" % content_type)
- icons[icon] = None
+ icons[icon_url] = None
if parser and parser.refresh:
refresh = parser.refresh
try:
except KeyError, key:
self.log(" no header: %s" % key)
- except IOError, msg:
- if (msg[0] == "http error") and (msg[1] == -1):
- bookmark.no_error = "The server did not return any header - it is not an error, actually"
- self.log(' no headers: %s' % bookmark.no_error)
- else:
- bookmark.error = get_error(msg)
- self.log(' Error: %s' % bookmark.error)
-
except EOFError:
bookmark.error = "Unexpected EOF (FTP server closed connection)"
self.log(' EOF: %s' % bookmark.error)
self.log(' Exception: %s' % bookmark.error)
finally:
- self.finish_check_url(bookmark, fname)
+ self.finish_check_url(bookmark)
# Tested
return 1
- def finish_check_url(self, bookmark, fname=None):
- # Calculate these attributes even in case of an error
- if fname and os.path.exists(fname):
- size = str(os.path.getsize(fname))
- if size[-1] == 'L':
- size = size[:-1]
- bookmark.size = size
-
+ def finish_check_url(self, bookmark):
start = self.start
bookmark.last_tested = str(start)
now = int(time.time())
bookmark.test_time = str(now - start)
- urllib.urlcleanup()
+
+ self.cleanup()
--- /dev/null
+"""Simple, straightforward robot based on urllib
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_urllib']
+
+
+import sys, os
+import time, urllib
+from Robots.bkmk_robot_base import robot_base, RedirectException, get_error
+
+
+class MyURLopener(urllib.URLopener):
+ # Error 302 -- relocated (temporarily)
+ def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+ if headers.has_key('location'):
+ newurl = headers['location']
+ elif headers.has_key('uri'):
+ newurl = headers['uri']
+ else:
+ newurl = "Nowhere"
+ raise RedirectException(errcode, newurl)
+
+ # Error 301 -- also relocated (permanently)
+ http_error_301 = http_error_302
+ # Error 307 -- also relocated (temporary)
+ http_error_307 = http_error_302
+
+ # Error 401 -- authentication required
+ def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
+ raise IOError, ('http error', errcode, "Authentication required ", headers)
+
+ def http_error_default(self, url, fp, errcode, errmsg, headers):
+ if fp:
+ void = fp.read()
+ fp.close()
+ raise IOError, ('http error', errcode, errmsg, headers)
+
+
+urllib._urlopener = MyURLopener()
+
+# Fake headers to pretend this is a real browser
+_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
+urllib._urlopener.addheaders[0] = ('User-Agent', _version)
+_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
+ sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
+urllib._urlopener.addheader('X-User-Agent', _version)
+urllib._urlopener.addheader('Referer', '')
+
+urllib._urlopener.addheader('Connection', 'close')
+urllib._urlopener.addheader('Accept', '*/*')
+urllib._urlopener.addheader('Accept-Language', 'ru,en')
+urllib._urlopener.addheader('Cache-Control', 'max-age=300')
+
+
+urllib_ftpwrapper = urllib.ftpwrapper
+ftpcache_key = None
+
+class myftpwrapper(urllib_ftpwrapper):
+ def __init__(self, user, passwd, host, port, dirs):
+ urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
+ global ftpcache_key
+ ftpcache_key = (user, host, port, '/'.join(dirs))
+
+urllib.ftpwrapper = myftpwrapper
+
+
+class robot_urllib(robot_base):
+ def urlretrieve(self, bookmark, url, accept_charset=False):
+ try:
+ # Set fake referer to the base URL
+ urllib._urlopener.addheaders[2] = ('Referer', url)
+
+ if accept_charset and bookmark.charset:
+ urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
+ fname, headers = urllib.urlretrieve(url)
+ if accept_charset and bookmark.charset:
+ del urllib._urlopener.addheaders[-1]
+
+ infile = open(fname, 'rb')
+ content = infile.read()
+ infile.close()
+
+ return headers, content
+
+ except IOError, msg:
+ if (msg[0] == "http error") and (msg[1] == -1):
+ bookmark.no_error = "The server did not return any header - it is not an error, actually"
+ self.log(' no headers: %s' % bookmark.no_error)
+ else:
+ bookmark.error = get_error(msg)
+ self.log(' Error: %s' % bookmark.error)
+
+ return None, None
+
+ def get_ftp_welcome(self):
+ global ftpcache_key
+ _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
+ ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
+ # If there are - ftpcache_key in prev line is invalid.
+ return _welcome
+
+ def cleanup(self):
+ urllib.urlcleanup()