From 9d989c8c9ebd7f13e9f0fdc95fa139c99ed83387 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Sat, 31 May 2014 23:58:51 +0400 Subject: [PATCH] Split simple robot Separate network operations and URL handling/HTML parsing. --- Robots/bkmk_rforking_sub.py | 9 +- .../{bkmk_rsimple.py => bkmk_robot_base.py} | 149 +++++------------- Robots/bkmk_rurllib.py | 110 +++++++++++++ 3 files changed, 151 insertions(+), 117 deletions(-) rename Robots/{bkmk_rsimple.py => bkmk_robot_base.py} (58%) create mode 100644 Robots/bkmk_rurllib.py diff --git a/Robots/bkmk_rforking_sub.py b/Robots/bkmk_rforking_sub.py index 1b4d59b..73956e2 100755 --- a/Robots/bkmk_rforking_sub.py +++ b/Robots/bkmk_rforking_sub.py @@ -1,11 +1,12 @@ #! /usr/bin/env python -"""Check URL - subprocess for the forking robot +"""Subprocess for the forking robot - check URL using bkmk_rurllib robot This file is a part of Bookmarks database and Internet robot. + """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1999-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 1999-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = [] @@ -31,8 +32,8 @@ def run(): from m_lib.flog import openlog log = openlog("check2.log") - from bkmk_rsimple import robot_simple - robot = robot_simple(log) + from bkmk_rurllib import robot_urllib + robot = robot_urllib(log) while 1: bookmark = pickle.loads(bkmk_in.read_record()) diff --git a/Robots/bkmk_rsimple.py b/Robots/bkmk_robot_base.py similarity index 58% rename from Robots/bkmk_rsimple.py rename to Robots/bkmk_robot_base.py index 2c4df9e..63fd73e 100644 --- a/Robots/bkmk_rsimple.py +++ b/Robots/bkmk_robot_base.py @@ -1,4 +1,4 @@ -"""Simple, strightforward robot +"""Base class for robots This file is a part of Bookmarks database and Internet robot. 
@@ -8,10 +8,10 @@ __author__ = "Oleg Broytman " __copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design" __license__ = "GNU GPL" -__all__ = ['robot_simple', 'get_error'] +__all__ = ['robot_base', 'get_error'] -import sys, os +import sys import time, urllib from base64 import b64encode from urlparse import urljoin @@ -20,7 +20,7 @@ from m_lib.net.www.util import parse_time from m_lib.md5wrapper import md5wrapper from bkmk_objects import Robot -from parse_html import parse_filename +from parse_html import parse_html class RedirectException(Exception): @@ -36,49 +36,6 @@ class RedirectException(Exception): self.url = newurl -class MyURLopener(urllib.URLopener): - # Error 302 -- relocated (temporarily) - def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): - if headers.has_key('location'): - newurl = headers['location'] - elif headers.has_key('uri'): - newurl = headers['uri'] - else: - newurl = "Nowhere" - raise RedirectException(errcode, newurl) - - # Error 301 -- also relocated (permanently) - http_error_301 = http_error_302 - # Error 307 -- also relocated (temporary) - http_error_307 = http_error_302 - - # Error 401 -- authentication required - def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): - raise IOError, ('http error', errcode, "Authentication required ", headers) - - def http_error_default(self, url, fp, errcode, errmsg, headers): - if fp: - void = fp.read() - fp.close() - raise IOError, ('http error', errcode, errmsg, headers) - - -urllib._urlopener = MyURLopener() - -# Fake headers to pretend this is a real browser -_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0" -urllib._urlopener.addheaders[0] = ('User-Agent', _version) -_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % ( - sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__) -urllib._urlopener.addheader('X-User-Agent', _version) -urllib._urlopener.addheader('Referer', '') - 
-urllib._urlopener.addheader('Connection', 'close') -urllib._urlopener.addheader('Accept', '*/*') -urllib._urlopener.addheader('Accept-Language', 'ru,en') -urllib._urlopener.addheader('Cache-Control', 'max-age=300') - - def get_error(msg): if isinstance(msg, str): return msg @@ -90,31 +47,11 @@ def get_error(msg): return "(%s)" % ' '.join(s) -urllib_ftpwrapper = urllib.ftpwrapper -ftpcache_key = None - -class myftpwrapper(urllib_ftpwrapper): - def __init__(self, user, passwd, host, port, dirs): - urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs) - global ftpcache_key - ftpcache_key = (user, host, port, '/'.join(dirs)) - -urllib.ftpwrapper = myftpwrapper - -def get_welcome(): - global ftpcache_key - _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome - ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db. - # If there are - ftpcache_key in prev line is invalid. - return _welcome - - icons = {} # Icon cache; maps URL to a tuple (content type, data) # or None if there is no icon. 
-class robot_simple(Robot): +class robot_base(Robot): def check_url(self, bookmark): - fname = None try: self.start = int(time.time()) bookmark.icon = None @@ -123,12 +60,11 @@ class robot_simple(Robot): url_host, url_path = urllib.splithost(url_rest) url_path, url_tag = urllib.splittag(url_path) - # Set fake referer to the root of the site - urllib._urlopener.addheaders[2] = ('Referer', "%s://%s%s" % (url_type, url_host, url_path)) + url = "%s://%s%s" % (url_type, url_host, url_path) + headers, content = self.urlretrieve(bookmark, url, True) - if bookmark.charset: urllib._urlopener.addheader('Accept-Charset', bookmark.charset) - fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path)) - if bookmark.charset: del urllib._urlopener.addheaders[-1] + if content is None: + return 1 size = 0 last_modified = None @@ -137,7 +73,7 @@ class robot_simple(Robot): try: size = headers["Content-Length"] except KeyError: - pass + size = len(content) try: last_modified = headers["Last-Modified"] @@ -146,6 +82,8 @@ class robot_simple(Robot): if last_modified: last_modified = parse_time(last_modified) + else: + size = len(content) if last_modified: last_modified = str(int(last_modified)) @@ -157,9 +95,9 @@ class robot_simple(Robot): md5 = md5wrapper() if urllib._urlopener.type == "ftp": # Pass welcome message through MD5 - md5.update(get_welcome()) + md5.update(self.get_ftp_welcome()) - md5.md5file(fname) + md5.update(content) bookmark.md5 = str(md5) if headers: @@ -182,7 +120,7 @@ class robot_simple(Robot): else: html = False if html: - parser = parse_filename(fname, charset, self.log) + parser = parse_html(content, charset, self.log) if parser: bookmark.real_title = parser.title icon = parser.icon @@ -190,25 +128,27 @@ class robot_simple(Robot): icon = None if not icon: icon = "/favicon.ico" - icon = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon) - self.log(" looking for icon at: %s" % icon) - if icon in icons: - if icons[icon]: - 
bookmark.icon_href = icon - content_type, bookmark.icon = icons[icon] + icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon) + self.log(" looking for icon at: %s" % icon_url) + if icon_url in icons: + if icons[icon_url]: + bookmark.icon_href = icon_url + content_type, bookmark.icon = icons[icon_url] self.log(" cached icon: %s" % content_type) else: self.log(" cached icon: no icon") else: try: - _icon = icon + _icon_url = icon_url for i in range(8): try: - icon_fname, headers = urllib.urlretrieve(_icon) + icon_headers, icon_data = self.urlretrieve(bookmark, _icon_url) except RedirectException, e: - _icon = e.url - self.log(" redirect to : %s" % _icon) + _icon_url = e.url + self.log(" redirect to : %s" % _icon_url) else: + if icon_data is None: + raise IOError("No icon") break else: raise IOError("Too many redirects") @@ -216,26 +156,23 @@ class robot_simple(Robot): etype, emsg, tb = sys.exc_info() self.log(" no icon : %s %s" % (etype, emsg)) etype = emsg = tb = None - icons[icon] = None + icons[icon_url] = None else: - content_type = headers["Content-Type"] + content_type = icon_headers["Content-Type"] if content_type.startswith("application/") \ or content_type.startswith("image/") \ or content_type.startswith("text/plain"): - icon_file = open(icon_fname, "rb") - icon_data = icon_file.read() - icon_file.close() - bookmark.icon_href = icon + bookmark.icon_href = icon_url self.log(" got icon : %s" % content_type) if content_type.startswith("application/") \ or content_type.startswith("text/plain"): self.log(" non-image content type, assume x-icon") content_type = 'image/x-icon' bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data)) - icons[icon] = (content_type, bookmark.icon) + icons[icon_url] = (content_type, bookmark.icon) else: self.log(" no icon : bad content type '%s'" % content_type) - icons[icon] = None + icons[icon_url] = None if parser and parser.refresh: refresh = parser.refresh try: @@ -256,14 +193,6 @@ class 
robot_simple(Robot): except KeyError, key: self.log(" no header: %s" % key) - except IOError, msg: - if (msg[0] == "http error") and (msg[1] == -1): - bookmark.no_error = "The server did not return any header - it is not an error, actually" - self.log(' no headers: %s' % bookmark.no_error) - else: - bookmark.error = get_error(msg) - self.log(' Error: %s' % bookmark.error) - except EOFError: bookmark.error = "Unexpected EOF (FTP server closed connection)" self.log(' EOF: %s' % bookmark.error) @@ -283,22 +212,16 @@ class robot_simple(Robot): self.log(' Exception: %s' % bookmark.error) finally: - self.finish_check_url(bookmark, fname) + self.finish_check_url(bookmark) # Tested return 1 - def finish_check_url(self, bookmark, fname=None): - # Calculate these attributes even in case of an error - if fname and os.path.exists(fname): - size = str(os.path.getsize(fname)) - if size[-1] == 'L': - size = size[:-1] - bookmark.size = size - + def finish_check_url(self, bookmark): start = self.start bookmark.last_tested = str(start) now = int(time.time()) bookmark.test_time = str(now - start) - urllib.urlcleanup() + + self.cleanup() diff --git a/Robots/bkmk_rurllib.py b/Robots/bkmk_rurllib.py new file mode 100644 index 0000000..f0a614e --- /dev/null +++ b/Robots/bkmk_rurllib.py @@ -0,0 +1,110 @@ +"""Simple, straightforward robot based on urllib +This file is a part of Bookmarks database and Internet robot. 
+ +""" + +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design" +__license__ = "GNU GPL" + +__all__ = ['robot_urllib'] + + +import sys, os +import time, urllib +from Robots.bkmk_robot_base import robot_base, RedirectException, get_error + + +class MyURLopener(urllib.URLopener): + # Error 302 -- relocated (temporarily) + def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): + if headers.has_key('location'): + newurl = headers['location'] + elif headers.has_key('uri'): + newurl = headers['uri'] + else: + newurl = "Nowhere" + raise RedirectException(errcode, newurl) + + # Error 301 -- also relocated (permanently) + http_error_301 = http_error_302 + # Error 307 -- also relocated (temporary) + http_error_307 = http_error_302 + + # Error 401 -- authentication required + def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): + raise IOError, ('http error', errcode, "Authentication required ", headers) + + def http_error_default(self, url, fp, errcode, errmsg, headers): + if fp: + void = fp.read() + fp.close() + raise IOError, ('http error', errcode, errmsg, headers) + + +urllib._urlopener = MyURLopener() + +# Fake headers to pretend this is a real browser +_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0" +urllib._urlopener.addheaders[0] = ('User-Agent', _version) +_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % ( + sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__) +urllib._urlopener.addheader('X-User-Agent', _version) +urllib._urlopener.addheader('Referer', '') + +urllib._urlopener.addheader('Connection', 'close') +urllib._urlopener.addheader('Accept', '*/*') +urllib._urlopener.addheader('Accept-Language', 'ru,en') +urllib._urlopener.addheader('Cache-Control', 'max-age=300') + + +urllib_ftpwrapper = urllib.ftpwrapper +ftpcache_key = None + +class myftpwrapper(urllib_ftpwrapper): + def __init__(self, user, passwd, host, port, 
dirs): + urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs) + global ftpcache_key + ftpcache_key = (user, host, port, '/'.join(dirs)) + +urllib.ftpwrapper = myftpwrapper + + +class robot_urllib(robot_base): + def urlretrieve(self, bookmark, url, accept_charset=False): + try: + # Set fake referer to the base URL + urllib._urlopener.addheaders[2] = ('Referer', url) + + if accept_charset and bookmark.charset: + urllib._urlopener.addheader('Accept-Charset', bookmark.charset) + fname, headers = urllib.urlretrieve(url) + if accept_charset and bookmark.charset: + del urllib._urlopener.addheaders[-1] + + infile = open(fname, 'rb') + content = infile.read() + infile.close() + + return headers, content + + except IOError, msg: + if (msg[0] == "http error") and (msg[1] == -1): + bookmark.no_error = "The server did not return any header - it is not an error, actually" + self.log(' no headers: %s' % bookmark.no_error) + else: + bookmark.error = get_error(msg) + self.log(' Error: %s' % bookmark.error) + + return None, None + + def get_ftp_welcome(self): + global ftpcache_key + _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome + ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db. + # If there are - ftpcache_key in prev line is invalid. + return _welcome + + def cleanup(self): + urllib.urlcleanup() -- 2.39.5