]> git.phdru.name Git - bookmarks_db.git/commitdiff
Split simple robot
authorOleg Broytman <phd@phdru.name>
Sat, 31 May 2014 19:58:51 +0000 (23:58 +0400)
committerOleg Broytman <phd@phdru.name>
Sat, 31 May 2014 19:58:51 +0000 (23:58 +0400)
Separate network operations and URL handling/HTML parsing.

Robots/bkmk_rforking_sub.py
Robots/bkmk_robot_base.py [moved from Robots/bkmk_rsimple.py with 58% similarity]
Robots/bkmk_rurllib.py [new file with mode: 0644]

index 1b4d59bf7b459193f491e5fdb3aa38c494d32944..73956e246d6b02342e3256248bc7e90639c8b501 100755 (executable)
@@ -1,11 +1,12 @@
 #! /usr/bin/env python
-"""Check URL - subprocess for the forking robot
+"""Subprocess for the forking robot - check URL using bkmk_rurllib robot
 
 This file is a part of Bookmarks database and Internet robot.
+
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1999-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1999-2014 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = []
@@ -31,8 +32,8 @@ def run():
 
    from m_lib.flog import openlog
    log = openlog("check2.log")
-   from bkmk_rsimple import robot_simple
-   robot = robot_simple(log)
+   from bkmk_rurllib import robot_urllib
+   robot = robot_urllib(log)
 
    while 1:
       bookmark = pickle.loads(bkmk_in.read_record())
similarity index 58%
rename from Robots/bkmk_rsimple.py
rename to Robots/bkmk_robot_base.py
index 2c4df9e278f99b186daa869720f9437c81f7cefa..63fd73e08e9e71b0423d44eca0b8c9a1db671725 100644 (file)
@@ -1,4 +1,4 @@
-"""Simple, strightforward robot
+"""Base class for robots
 
 This file is a part of Bookmarks database and Internet robot.
 
@@ -8,10 +8,10 @@ __author__ = "Oleg Broytman <phd@phdru.name>"
 __copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
 __license__ = "GNU GPL"
 
-__all__ = ['robot_simple', 'get_error']
+__all__ = ['robot_base', 'get_error']
 
 
-import sys, os
+import sys
 import time, urllib
 from base64 import b64encode
 from urlparse import urljoin
@@ -20,7 +20,7 @@ from m_lib.net.www.util import parse_time
 from m_lib.md5wrapper import md5wrapper
 
 from bkmk_objects import Robot
-from parse_html import parse_filename
+from parse_html import parse_html
 
 
 class RedirectException(Exception):
@@ -36,49 +36,6 @@ class RedirectException(Exception):
       self.url = newurl
 
 
-class MyURLopener(urllib.URLopener):
-   # Error 302 -- relocated (temporarily)
-   def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 
-      if headers.has_key('location'):
-         newurl = headers['location']
-      elif headers.has_key('uri'):
-         newurl = headers['uri']
-      else:
-         newurl = "Nowhere"
-      raise RedirectException(errcode, newurl)
-
-   # Error 301 -- also relocated (permanently)
-   http_error_301 = http_error_302
-   # Error 307 -- also relocated (temporary)
-   http_error_307 = http_error_302
-
-   # Error 401 -- authentication required
-   def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 
-      raise IOError, ('http error', errcode, "Authentication required ", headers)
-
-   def http_error_default(self, url, fp, errcode, errmsg, headers):
-      if fp:
-         void = fp.read()
-         fp.close()
-      raise IOError, ('http error', errcode, errmsg, headers)
-
-
-urllib._urlopener = MyURLopener()
-
-# Fake headers to pretend this is a real browser
-_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
-urllib._urlopener.addheaders[0] = ('User-Agent', _version)
-_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
-   sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
-urllib._urlopener.addheader('X-User-Agent', _version)
-urllib._urlopener.addheader('Referer', '')
-
-urllib._urlopener.addheader('Connection', 'close')
-urllib._urlopener.addheader('Accept', '*/*')
-urllib._urlopener.addheader('Accept-Language', 'ru,en')
-urllib._urlopener.addheader('Cache-Control', 'max-age=300')
-
-
 def get_error(msg):
    if isinstance(msg, str):
       return msg
@@ -90,31 +47,11 @@ def get_error(msg):
       return "(%s)" % ' '.join(s)
 
 
-urllib_ftpwrapper = urllib.ftpwrapper
-ftpcache_key = None
-
-class myftpwrapper(urllib_ftpwrapper):
-   def __init__(self, user, passwd, host, port, dirs):
-      urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
-      global ftpcache_key
-      ftpcache_key = (user, host, port, '/'.join(dirs))
-
-urllib.ftpwrapper = myftpwrapper
-
-def get_welcome():
-   global ftpcache_key
-   _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
-   ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
-                       # If there are - ftpcache_key in prev line is invalid.
-   return _welcome
-
-
 icons = {} # Icon cache; maps URL to a tuple (content type, data)
            # or None if there is no icon.
 
-class robot_simple(Robot):
+class robot_base(Robot):
    def check_url(self, bookmark):
-      fname = None
       try:
          self.start = int(time.time())
          bookmark.icon = None
@@ -123,12 +60,11 @@ class robot_simple(Robot):
          url_host, url_path = urllib.splithost(url_rest)
          url_path, url_tag  = urllib.splittag(url_path)
 
-         # Set fake referer to the root of the site
-         urllib._urlopener.addheaders[2] = ('Referer', "%s://%s%s" % (url_type, url_host, url_path))
+         url = "%s://%s%s" % (url_type, url_host, url_path)
+         headers, content = self.urlretrieve(bookmark, url, True)
 
-         if bookmark.charset: urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
-         fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path))
-         if bookmark.charset: del urllib._urlopener.addheaders[-1]
+         if content is None:
+             return 1
 
          size = 0
          last_modified = None
@@ -137,7 +73,7 @@ class robot_simple(Robot):
             try:
                size = headers["Content-Length"]
             except KeyError:
-               pass
+               size = len(content)
 
             try:
                last_modified = headers["Last-Modified"]
@@ -146,6 +82,8 @@ class robot_simple(Robot):
 
             if last_modified:
                last_modified = parse_time(last_modified)
+         else:
+            size = len(content)
 
          if last_modified:
             last_modified = str(int(last_modified))
@@ -157,9 +95,9 @@ class robot_simple(Robot):
 
          md5 = md5wrapper()
          if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
-            md5.update(get_welcome())
+            md5.update(self.get_ftp_welcome())
 
-         md5.md5file(fname)
+         md5.update(content)
          bookmark.md5 = str(md5)
 
          if headers:
@@ -182,7 +120,7 @@ class robot_simple(Robot):
                else:
                   html = False
                if html:
-                  parser = parse_filename(fname, charset, self.log)
+                  parser = parse_html(content, charset, self.log)
                   if parser:
                       bookmark.real_title = parser.title
                       icon = parser.icon
@@ -190,25 +128,27 @@ class robot_simple(Robot):
                      icon = None
                   if not icon:
                      icon = "/favicon.ico"
-                  icon = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
-                  self.log("   looking for icon at: %s" % icon)
-                  if icon in icons:
-                     if icons[icon]:
-                        bookmark.icon_href = icon
-                        content_type, bookmark.icon = icons[icon]
+                  icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
+                  self.log("   looking for icon at: %s" % icon_url)
+                  if icon_url in icons:
+                     if icons[icon_url]:
+                        bookmark.icon_href = icon_url
+                        content_type, bookmark.icon = icons[icon_url]
                         self.log("   cached icon: %s" % content_type)
                      else:
                         self.log("   cached icon: no icon")
                   else:
                      try:
-                        _icon = icon
+                        _icon_url = icon_url
                         for i in range(8):
                            try:
-                              icon_fname, headers = urllib.urlretrieve(_icon)
+                              icon_headers, icon_data = self.urlretrieve(bookmark, _icon_url)
                            except RedirectException, e:
-                              _icon = e.url
-                              self.log("   redirect to : %s" % _icon)
+                              _icon_url = e.url
+                              self.log("   redirect to : %s" % _icon_url)
                            else:
+                              if icon_data is None:
+                                   raise IOError("No icon")
                               break
                         else:
                            raise IOError("Too many redirects")
@@ -216,26 +156,23 @@ class robot_simple(Robot):
                         etype, emsg, tb = sys.exc_info()
                         self.log("   no icon        : %s %s" % (etype, emsg))
                         etype = emsg = tb = None
-                        icons[icon] = None
+                        icons[icon_url] = None
                      else:
-                        content_type = headers["Content-Type"]
+                        content_type = icon_headers["Content-Type"]
                         if content_type.startswith("application/") \
                               or content_type.startswith("image/") \
                               or content_type.startswith("text/plain"):
-                           icon_file = open(icon_fname, "rb")
-                           icon_data = icon_file.read()
-                           icon_file.close()
-                           bookmark.icon_href = icon
+                           bookmark.icon_href = icon_url
                            self.log("   got icon       : %s" % content_type)
                            if content_type.startswith("application/") \
                                  or content_type.startswith("text/plain"):
                               self.log("   non-image content type, assume x-icon")
                               content_type = 'image/x-icon'
                            bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
-                           icons[icon] = (content_type, bookmark.icon)
+                           icons[icon_url] = (content_type, bookmark.icon)
                         else:
                            self.log("   no icon        : bad content type '%s'" % content_type)
-                           icons[icon] = None
+                           icons[icon_url] = None
                   if parser and parser.refresh:
                      refresh = parser.refresh
                      try:
@@ -256,14 +193,6 @@ class robot_simple(Robot):
             except KeyError, key:
                self.log("   no header: %s" % key)
 
-      except IOError, msg:
-         if (msg[0] == "http error") and (msg[1] == -1):
-            bookmark.no_error = "The server did not return any header - it is not an error, actually"
-            self.log('   no headers: %s' % bookmark.no_error)
-         else:
-            bookmark.error = get_error(msg)
-            self.log('   Error: %s' % bookmark.error)
-
       except EOFError:
          bookmark.error = "Unexpected EOF (FTP server closed connection)"
          self.log('   EOF: %s' % bookmark.error)
@@ -283,22 +212,16 @@ class robot_simple(Robot):
          self.log('   Exception: %s' % bookmark.error)
 
       finally:
-         self.finish_check_url(bookmark, fname)
+         self.finish_check_url(bookmark)
 
       # Tested
       return 1
 
-   def finish_check_url(self, bookmark, fname=None):
-      # Calculate these attributes even in case of an error
-      if fname and os.path.exists(fname):
-         size = str(os.path.getsize(fname))
-         if size[-1] == 'L':
-            size = size[:-1]
-         bookmark.size = size
-
+   def finish_check_url(self, bookmark):
       start = self.start
       bookmark.last_tested = str(start)
 
       now = int(time.time())
       bookmark.test_time = str(now - start)
-      urllib.urlcleanup()
+
+      self.cleanup()
diff --git a/Robots/bkmk_rurllib.py b/Robots/bkmk_rurllib.py
new file mode 100644 (file)
index 0000000..f0a614e
--- /dev/null
@@ -0,0 +1,110 @@
+"""Simple, straightforward robot based on urllib
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_urllib']
+
+
+import sys, os
+import time, urllib
+from Robots.bkmk_robot_base import robot_base, RedirectException, get_error
+
+
+class MyURLopener(urllib.URLopener):
+   # Error 302 -- relocated (temporarily)
+   def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 
+      if headers.has_key('location'):
+         newurl = headers['location']
+      elif headers.has_key('uri'):
+         newurl = headers['uri']
+      else:
+         newurl = "Nowhere"
+      raise RedirectException(errcode, newurl)
+
+   # Error 301 -- also relocated (permanently)
+   http_error_301 = http_error_302
+   # Error 307 -- also relocated (temporarily)
+   http_error_307 = http_error_302
+
+   # Error 401 -- authentication required
+   def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 
+      raise IOError, ('http error', errcode, "Authentication required ", headers)
+
+   def http_error_default(self, url, fp, errcode, errmsg, headers):
+      if fp:
+         void = fp.read()
+         fp.close()
+      raise IOError, ('http error', errcode, errmsg, headers)
+
+
+urllib._urlopener = MyURLopener()
+
+# Fake headers to pretend this is a real browser
+_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
+urllib._urlopener.addheaders[0] = ('User-Agent', _version)
+_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
+   sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
+urllib._urlopener.addheader('X-User-Agent', _version)
+urllib._urlopener.addheader('Referer', '')
+
+urllib._urlopener.addheader('Connection', 'close')
+urllib._urlopener.addheader('Accept', '*/*')
+urllib._urlopener.addheader('Accept-Language', 'ru,en')
+urllib._urlopener.addheader('Cache-Control', 'max-age=300')
+
+
+urllib_ftpwrapper = urllib.ftpwrapper
+ftpcache_key = None
+
+class myftpwrapper(urllib_ftpwrapper):
+   def __init__(self, user, passwd, host, port, dirs):
+      urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
+      global ftpcache_key
+      ftpcache_key = (user, host, port, '/'.join(dirs))
+
+urllib.ftpwrapper = myftpwrapper
+
+
+class robot_urllib(robot_base):
+   def urlretrieve(self, bookmark, url, accept_charset=False):
+      try:
+         # Set fake referer to the base URL
+         urllib._urlopener.addheaders[2] = ('Referer', url)
+
+         if accept_charset and bookmark.charset:
+            urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
+         fname, headers = urllib.urlretrieve(url)
+         if accept_charset and bookmark.charset:
+            del urllib._urlopener.addheaders[-1]
+
+         infile = open(fname, 'rb')
+         content = infile.read()
+         infile.close()
+
+         return headers, content
+
+      except IOError, msg:
+         if (msg[0] == "http error") and (msg[1] == -1):
+            bookmark.no_error = "The server did not return any header - it is not an error, actually"
+            self.log('   no headers: %s' % bookmark.no_error)
+         else:
+            bookmark.error = get_error(msg)
+            self.log('   Error: %s' % bookmark.error)
+
+         return None, None
+
+   def get_ftp_welcome(self):
+      global ftpcache_key
+      _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
+      ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
+                          # If there are - ftpcache_key in prev line is invalid.
+      return _welcome
+
+   def cleanup(self):
+      urllib.urlcleanup()