]> git.phdru.name Git - bookmarks_db.git/blobdiff - check_urls2.py
Initial revision
[bookmarks_db.git] / check_urls2.py
diff --git a/check_urls2.py b/check_urls2.py
new file mode 100755 (executable)
index 0000000..73a91f9
--- /dev/null
@@ -0,0 +1,310 @@
+#! /usr/local/bin/python -O
+"""
+   For every URL in the FLAD database get info from the Net
+   and store info in check.db
+
+   Version 2.0
+   Written by BroytMann, Aug 1997 - Mar 1999. Copyright (C) 1997-1999 PhiloSoft Design
+"""
+
+
+import sys, os, stat, string, time
+from getopt import getopt
+
+import urllib, tempfile
+from copy import _copy_dict
+
+import cPickle
+pickle = cPickle
+
+import fladm, fladc, shutil
+from flog import makelog, openlog
+
+
+# Put the current directory first in PATH before importing subproc.
+# NOTE(review): presumably this lets Subprocess find "check_url_sub.py" by
+# its bare name - confirm in subproc. Raises KeyError if PATH is not set.
+os.environ["PATH"] = ".:" + os.environ["PATH"]
+from subproc import Subprocess, RecordFile
+
+
+def set_checkpoint(rec_no):
+   cpfile = open("check.dat", 'w')
+   cpfile.write("# chk_urls checkpoint file\n")
+   cpfile.write("Size: %d\n" % db_stat[stat.ST_SIZE])
+   cpfile.write("MTime: %d\n" % db_stat[stat.ST_MTIME])
+   cpfile.write("Record: %d" % rec_no)
+   cpfile.close()
+
+def get_checkpoint():
+   try:
+      cpfile = fladc.load_file("check.dat")
+      if (string.atoi(cpfile["Size"]) <> db_stat[stat.ST_SIZE]) or \
+         (string.atoi(cpfile["MTime"]) <> db_stat[stat.ST_MTIME]):
+         return -3
+
+      return string.atoi(cpfile["Record"])
+
+   except IOError: # No such file
+      return -1
+
+   except KeyError: # No such key in checkpoint file
+      return -2
+
+   except string.atoi_error: # Wrong numeric format
+      return -2
+
+   return 0
+
+def start(db_name, report_stats):
+   start_recno = get_checkpoint()
+   if start_recno < 0:
+      if start_recno == -1:
+         log = makelog("check.log")
+         log("chk_urls started")
+         if report_stats:
+            print "   chk_urls: normal start"
+
+      elif start_recno == -2:
+         log = openlog("check.log")
+         log("chk_urls started")
+         log("   invalid checkpoint file, checkpoint ignored")
+         if report_stats:
+            print "   chk_urls: invalid checkpoint file, checkpoint ignored"
+
+      elif start_recno == -3:
+         log = makelog("check.log")
+         log("chk_urls started")
+         log("   bookmarks.db changed, checkpoint ignored")
+         if report_stats:
+            print "   chk_urls: bookmarks.db changed, checkpoint ignored"
+
+      else:
+         raise RuntimeError, "wrong get_checkpoint() return: `%s'" % str(start_recno)
+
+      start_recno = 0
+
+   elif start_recno == 0:
+      raise RuntimeError, "wrong get_checkpoint() return: `%s'" % str(start_recno)
+
+   else: # start_recno > 0
+      if os.path.exists("check.db"):
+         if not os.path.exists("check.old"):
+            shutil.copy("check.db", "check.old")
+         db_name = "check.db"
+
+         log = openlog("check.log")
+         log("chk_urls started")
+         log("   found valid checkpoint file, continue")
+         if report_stats:
+            print "   chk_urls: found valid checkpoint file, continue"
+
+      else:
+         log = makelog("check.log")
+         log("chk_urls started")
+         log("   valid checkpoint, but no check.db file, restarting")
+         if report_stats:
+            print "   chk_urls: valid checkpoint, but no check.db file, restarting"
+         start_recno = 0
+
+   return start_recno, db_name, log
+
+
+# Name of the temporary file; check_url() passes it to the checker subprocess
+# in the "TEMPFILE" field of each record and run() removes the file at the end.
+# NOTE(review): the value of tempfile.gettempprefix() depends on the Python
+# version - confirm the resulting name is what check_url_sub.py expects
+tempfname = "check_urls" + tempfile.gettempprefix() + ".tmp"
+
+
+# The checker subprocess and the record pipe to it; both are created (and
+# created again after a timeout) by restart_subp()
+check_subp = None
+subp_pipe = None
+
+def restart_subp(log, report_stats):
+   # Start the checker subprocess, or replace the current one.
+   # Rebinds the module-level check_subp and subp_pipe to a new Subprocess
+   # running "check_url_sub.py" and to a RecordFile built on top of it.
+   #    log          - callable that records the restart message
+   #    report_stats - if true, the restart message is also printed
+   global check_subp, subp_pipe
+   if check_subp:
+      # There already is a subprocess, so this is a real restart; the very
+      # first call (check_subp is None) is silent
+      log("   restarting hanging subprocess")
+      if report_stats:
+         print "   chk_urls: restarting hanging subprocess"
+      del check_subp
+   # Drop the old pipe as well (on the first call this just removes the None).
+   # NOTE(review): presumably releasing the last references lets the old
+   # Subprocess object shut down its child - confirm in subproc. If the
+   # constructors below fail, the globals stay unbound.
+   del subp_pipe
+
+   check_subp = Subprocess("check_url_sub.py")
+   subp_pipe = RecordFile(check_subp)
+
+
+def check_url(record, log, report_stats):
+   try:
+      record["TEMPFILE"] = tempfname
+      subp_pipe.write_record(pickle.dumps(record))
+
+      if check_subp.waitForPendingChar(900): # wait 15 minutes
+         rec = pickle.loads(subp_pipe.read_record())
+         del record["TEMPFILE"]
+         for key in rec.keys():
+            record[key] = rec[key]
+      else:
+         restart_subp(log, report_stats)
+         del record["TEMPFILE"]
+         record["Error"] = "Subprocess connection timed out"
+
+   except KeyboardInterrupt:
+      return 0
+
+   return 1
+
+
+def run():
+   # Command-line entry point: check every record of the database that has
+   # a "URL" field and save the results in "check.db".
+   #
+   # Options:
+   #    -i   do not show the progress bar
+   #    -s   do not print statistics and progress messages
+   #    -e   recheck only those records of "check.db" that have an "Error"
+   #         field not starting with "Moved"
+   # One optional argument names the database (default is "bookmarks.db").
+   #
+   # Exits with status 1 if there are too many arguments or the check was
+   # interrupted by ^C (a checkpoint is written first), and with status 0
+   # if the database is empty.
+   optlist, args = getopt(sys.argv[1:], "ise")
+
+   show_pbar = 1
+   report_stats = 1
+   only_errors = 0
+   db_name = "bookmarks.db"
+
+   for _opt, _arg in optlist:
+      if _opt == '-i':
+         show_pbar = 0
+      if _opt == '-s':
+         report_stats = 0
+      if _opt == '-e':
+         only_errors = 1
+   # The loop variables do not exist if no options were given
+   try:
+      del _opt, _arg
+   except NameError:
+      pass
+
+   if report_stats:
+      print "BroytMann check_urls, Copyright (C) 1997-1999 PhiloSoft Design"
+
+   if args:
+      if len(args) > 1:
+         sys.stderr.write("chk_urls: too many arguments\n")
+         sys.exit(1)
+      else:
+         db_name = args[0]
+
+   # The progress bar is shown only if stderr is a terminal...
+   if show_pbar:
+      show_pbar = sys.stderr.isatty()
+
+   # ...and the optional tty_pbar module is available
+   if show_pbar:
+      try:
+         from tty_pbar import ttyProgressBar
+      except ImportError:
+         show_pbar = 0
+
+   # db_stat is read by set_checkpoint() and get_checkpoint(). It describes
+   # the database named on the command line, even when -e switches db_name
+   # to "check.db" below
+   global db_stat, log
+   db_stat = os.stat(db_name)
+
+   if only_errors:
+      start_recno = 0
+      db_name = "check.db"
+      log = openlog("check.log")
+      log("chk_urls restarted for errors")
+   else:
+      start_recno, db_name, log = start(db_name, report_stats)
+
+   if report_stats:
+      sys.stdout.write("Loading %s: " % db_name)
+      sys.stdout.flush()
+
+   bookmarks_db = fladm.load_from_file(db_name, fladm.check_record, ["Level"])
+   # Keep a reference to the complete database for saving: bookmarks_db may
+   # be replaced by a filtered list below
+   bookmarks_dbstore = bookmarks_db
+
+   if only_errors:
+      bookmarks_db = filter(lambda r: r.has_key("Error") and r["Error"][:5] <> "Moved", bookmarks_db)
+
+   if report_stats:
+      print "Ok"
+
+   db_len = len(bookmarks_db)
+   if db_len == 0:
+      print "Database empty"
+      sys.exit(0)
+
+   # A checkpoint beyond the end of the database is ignored
+   if start_recno >= db_len:
+      _s = "start_recno (%d) >= db_len (%d), restarting" % (start_recno, db_len)
+      log("   " + _s)
+      if report_stats:
+         print "   chk_urls: " + _s
+      del _s
+      start_recno = 0
+
+   if report_stats:
+      if only_errors:
+         s = "Rechecking errors: "
+      else:
+         s = "Checking: "
+      sys.stdout.write(s)
+      sys.stdout.flush()
+
+   # While the progress bar is displayed, messages are switched off;
+   # report_stats is restored after the loop
+   if show_pbar:
+      save_stats = report_stats
+      report_stats = 0
+      pbar = ttyProgressBar(0, db_len)
+
+   urls_no = 0
+   record_count = 0
+   start_time = time.time()
+
+   rcode = 1
+   restart_subp(log, report_stats) # Not restart, just start afresh
+   checked_dict = {} # Dictionary of checked URLs, mapped to records number
+
+   for record_no in range(start_recno, db_len):
+      if show_pbar:
+         pbar.display(record_no+1)
+
+      record = bookmarks_db[record_no]
+      record_count = record_count + 1
+
+      # The filter above guarantees that "Error" exists in -e mode
+      if only_errors:
+         del record["Error"]
+
+      if record.has_key("URL"):
+         url = record["URL"]
+         if checked_dict.has_key(url):
+            # The same URL was checked earlier in this run: reuse a copy of
+            # that record, keeping this record's own Level and Comment.
+            # NOTE(review): record["Comment"] fails with KeyError if the
+            # record has no comment - confirm fladm guarantees the field.
+            # NOTE(review): in -e mode bookmarks_db is the filtered list, so
+            # this assignment apparently does not reach bookmarks_dbstore,
+            # which is what gets saved - confirm.
+            log("Already checked %s" % url)
+            level = record["Level"]
+            comment = record["Comment"]
+            bookmarks_db[record_no] = _copy_dict(bookmarks_db[checked_dict[url]])
+            bookmarks_db[record_no]["Level"] = level
+            bookmarks_db[record_no]["Comment"] = comment
+         else:
+            log("Checking %s" % url)
+            rcode = check_url(record, log, report_stats)
+            if rcode:
+               current_time = time.time()
+               if current_time - start_time >= 300: # Save checkpoint and database every 5 min
+                  bookmarks_dbstore.store_to_file("check.db")
+                  set_checkpoint(record_no)
+                  log.flush()
+                  start_time = current_time
+               urls_no = urls_no + 1
+               checked_dict[url] = record_no
+            else:
+               # check_url() returns 0 only on KeyboardInterrupt
+               log("Interrupted by user (^C)")
+               break
+
+   if show_pbar:
+      del pbar
+      report_stats = save_stats 
+
+   if report_stats:
+      print "Ok"
+      print record_count, "records checked"
+      print urls_no, "URLs checked"
+
+   # Save the results whether the run was completed or interrupted
+   bookmarks_dbstore.store_to_file("check.db")
+
+   if rcode:
+      log("chk_urls finished ok")
+   log.close()
+
+   urllib.urlcleanup()
+   if os.path.exists(tempfname):
+      os.unlink(tempfname)
+
+   # A completed run needs no checkpoint; an interrupted one saves the number
+   # of the record it stopped at, so the next run can continue from there.
+   # NOTE(review): in -e mode record_no indexes the filtered list, not the
+   # whole database - confirm a later normal run handles such a checkpoint
+   if rcode:
+      if os.path.exists("check.dat"):
+         os.unlink("check.dat")
+   else:
+      set_checkpoint(record_no)
+      sys.exit(1)
+
+
+# Start checking only when the file is executed as a script, not imported
+if __name__ == '__main__':
+   run()