From 27c6253f3e707d0b90e67ee52f78e1335482e17e Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Sun, 6 Jul 2014 05:22:31 +0400
Subject: [PATCH] Add robot based on urllib2

---
 Robots/bkmk_rurllib2.py | 87 +++++++++++++++++++++++++++++++++++++++++
 doc/ChangeLog           |  2 ++
 doc/TODO                |  2 +-
 3 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 Robots/bkmk_rurllib2.py

diff --git a/Robots/bkmk_rurllib2.py b/Robots/bkmk_rurllib2.py
new file mode 100644
index 0000000..b15a7b0
--- /dev/null
+++ b/Robots/bkmk_rurllib2.py
@@ -0,0 +1,87 @@
+"""Robot based on urllib2
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2014 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_urllib2']
+
+
+import sys
+import httplib
+import urllib2
+from Robots.bkmk_robot_base import robot_base
+
+
+# Build the opener by hand rather than with urllib2.build_opener(): the
+# handler list has no HTTPRedirectHandler, so 3xx responses are raised as
+# HTTPError and robot_urllib2.get() reports them instead of following them.
+opener = urllib2.OpenerDirector()
+default_classes = [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
+    urllib2.FTPHandler, urllib2.HTTPErrorProcessor]
+if hasattr(httplib, 'HTTPS'):
+    default_classes.insert(0, urllib2.HTTPSHandler)
+for klass in default_classes:
+    opener.add_handler(klass())
+
+urllib2.install_opener(opener)
+
+
+# Fake headers to pretend this is a real browser
+_user_agent = "Mozilla/5.0 (X11; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0"
+_x_user_agent = "bookmarks_db (Python %d.%d.%d; urllib2)" % (
+    sys.version_info[0], sys.version_info[1], sys.version_info[2])
+
+
+class robot_urllib2(robot_base):
+    """Robot that fetches URLs with urllib2."""
+
+    def get(self, bookmark, url, accept_charset=False):
+        """Fetch url and describe the outcome as a 5-tuple.
+
+        Returns (error, redirect_code, redirect_to, headers, content):
+        - success: (None, None, None, response headers, response body);
+        - 301/302/303/307 with a Location header:
+          (None, status code, Location value, None, None);
+        - any other HTTP or URL error: (message, None, None, None, None).
+
+        bookmark.charset is sent as Accept-Charset only when accept_charset
+        is true and the charset is not empty.
+        """
+        request = urllib2.Request(url)
+        request.add_header('Accept', '*/*')
+        if accept_charset and bookmark.charset:
+            request.add_header('Accept-Charset', bookmark.charset)
+        request.add_header('Accept-Language', 'ru,en')
+        request.add_header('Cache-Control', 'max-age=300')
+        request.add_header('Connection', 'close')
+        request.add_header('Referer', url)
+        request.add_header('User-agent', _user_agent)
+        request.add_header('X-User-Agent', _x_user_agent)
+
+        try:
+            response = urllib2.urlopen(request)
+        except urllib2.HTTPError as e:
+            if e.code in (301, 302, 303, 307):
+                location = e.hdrs.get('Location')
+                if location:
+                    return None, e.code, location, None, None
+                # A redirect without a target cannot be followed; fall through
+                # and report it as an error instead of raising KeyError.
+            return "HTTP Error %s: %s" % (e.code, e.msg), None, None, None, None
+        except urllib2.URLError as e:
+            return "URL Error: %s" % e.reason, None, None, None, None
+
+        try:
+            return None, None, None, response.info(), response.read()
+        finally:
+            # Release the connection even if reading the body fails.
+            response.close()
+
+    def get_ftp_welcome(self):
+        """Return the FTP welcome message; this robot always returns ''."""
+        return ''
diff --git a/doc/ChangeLog b/doc/ChangeLog
index cc441e7..9627379 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -17,6 +17,8 @@ WHAT'S NEW in version 4.6.0 (2014-06-??)
 
    Pass subproc parameter to the subprocess to allow different robots.
 
+   Add a new robot based on urllib2.
+
 
 WHAT'S NEW in version 4.5.6 (2014-01-14)
 
diff --git a/doc/TODO b/doc/TODO
index 234dc2b..f98d076 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,4 +1,4 @@
-A new robot based on urllib2.
+Robot based on urllib2: handle timeout and ftp.
 
 A new robot based on PycURL.
 
-- 
2.39.5