+"""Simple, straightforward robot based on urllib
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['robot_urllib']
+
+
+import sys, os
+import time, urllib
+from Robots.bkmk_robot_base import robot_base, RedirectException, get_error
+
+
+class MyURLopener(urllib.URLopener):
+ # Error 302 -- relocated (temporarily)
+ def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+ if headers.has_key('location'):
+ newurl = headers['location']
+ elif headers.has_key('uri'):
+ newurl = headers['uri']
+ else:
+ newurl = "Nowhere"
+ raise RedirectException(errcode, newurl)
+
+ # Error 301 -- also relocated (permanently)
+ http_error_301 = http_error_302
+ # Error 307 -- also relocated (temporary)
+ http_error_307 = http_error_302
+
+ # Error 401 -- authentication required
+ def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
+ raise IOError, ('http error', errcode, "Authentication required ", headers)
+
+ def http_error_default(self, url, fp, errcode, errmsg, headers):
+ if fp:
+ void = fp.read()
+ fp.close()
+ raise IOError, ('http error', errcode, errmsg, headers)
+
+
+# Install our opener as urllib's global one so plain urllib.urlretrieve()
+# calls below go through MyURLopener.
+urllib._urlopener = MyURLopener()
+
+# Fake headers to pretend this is a real browser
+_version = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en) Gecko/20001221 Firefox/2.0.0"
+# Slot 0 is urllib's default User-Agent header -- overwrite it in place.
+urllib._urlopener.addheaders[0] = ('User-Agent', _version)
+_version = "bookmarks_db (Python %d.%d.%d; urllib/%s)" % (
+    sys.version_info[0], sys.version_info[1], sys.version_info[2], urllib.__version__)
+urllib._urlopener.addheader('X-User-Agent', _version)
+# Placeholder Referer at slot 2; robot_urllib.urlretrieve() rewrites this
+# slot per request, so its position in addheaders must not change.
+urllib._urlopener.addheader('Referer', '')
+
+urllib._urlopener.addheader('Connection', 'close')
+urllib._urlopener.addheader('Accept', '*/*')
+urllib._urlopener.addheader('Accept-Language', 'ru,en')
+urllib._urlopener.addheader('Cache-Control', 'max-age=300')
+
+
+# Monkey-patch urllib's ftpwrapper to capture the cache key of the most
+# recent FTP connection; get_ftp_welcome() uses it to look up the
+# server's welcome banner in urllib._urlopener.ftpcache.
+urllib_ftpwrapper = urllib.ftpwrapper
+# Key of the last FTP connection created, or None when already consumed.
+ftpcache_key = None
+
+class myftpwrapper(urllib_ftpwrapper):
+    def __init__(self, user, passwd, host, port, dirs):
+        urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs)
+        # Record the key under which urllib caches this connection.
+        global ftpcache_key
+        ftpcache_key = (user, host, port, '/'.join(dirs))
+
+urllib.ftpwrapper = myftpwrapper
+
+
+class robot_urllib(robot_base):
+ def urlretrieve(self, bookmark, url, accept_charset=False):
+ try:
+ # Set fake referer to the base URL
+ urllib._urlopener.addheaders[2] = ('Referer', url)
+
+ if accept_charset and bookmark.charset:
+ urllib._urlopener.addheader('Accept-Charset', bookmark.charset)
+ fname, headers = urllib.urlretrieve(url)
+ if accept_charset and bookmark.charset:
+ del urllib._urlopener.addheaders[-1]
+
+ infile = open(fname, 'rb')
+ content = infile.read()
+ infile.close()
+
+ return headers, content
+
+ except IOError, msg:
+ if (msg[0] == "http error") and (msg[1] == -1):
+ bookmark.no_error = "The server did not return any header - it is not an error, actually"
+ self.log(' no headers: %s' % bookmark.no_error)
+ else:
+ bookmark.error = get_error(msg)
+ self.log(' Error: %s' % bookmark.error)
+
+ return None, None
+
+ def get_ftp_welcome(self):
+ global ftpcache_key
+ _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
+ ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
+ # If there are - ftpcache_key in prev line is invalid.
+ return _welcome
+
+ def cleanup(self):
+ urllib.urlcleanup()