#! /usr/local/bin/python -O
"""
   Check URL - subprocess

   Written by BroytMann, Mar 1999 - Feb 2000. Copyright (C) 1999-2000 PhiloSoft Design
"""

# Provenance: reconstructed from a gitweb blobdiff of bookmarks_db.git in which
# check_url_sub.py (blob 6c301df, mode 100755) is deleted.


import sys, os, stat, string, time
import urllib, www_util

import cPickle
pickle = cPickle
from subproc import RecordFile

from md5wrapper import md5wrapper


# Cache key of the FTP connection most recently created through
# myftpwrapper(); reset to None by get_welcome() once it has been consumed.
ftpcache_key = None

def myftpwrapper(user, passwd, host, port, dirs):
    """Create an FTP wrapper with the original factory and remember its key.

    The tuple (user, host, port, '/'-joined dirs) is stored in the module
    global ftpcache_key so that get_welcome() can find the connection in the
    opener's ftpcache afterwards. Arguments are passed through unchanged.
    """
    global ftpcache_key
    ftpcache_key = (user, host, port, string.joinfields(dirs, '/'))
    return _ftpwrapper(user, passwd, host, port, dirs)

# Keep a reference to the real factory, then install the recording wrapper
# in its place (module-level side effect on urllib).
_ftpwrapper = urllib.ftpwrapper
urllib.ftpwrapper = myftpwrapper

def get_welcome():
    """Return the welcome message of the FTP connection recorded last.

    Looks up urllib._urlopener.ftpcache[ftpcache_key] and returns its
    ftp.welcome attribute, then resets ftpcache_key to None.

    NOTE(review): if urllib serves a request from an already cached FTP
    connection, myftpwrapper() is presumably not called, ftpcache_key is
    still None and the lookup below raises KeyError - confirm against the
    urllib version in use. Behaviour is intentionally left unchanged here.
    """
    global ftpcache_key
    _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome
    ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db.
                        # If there are - ftpcache_key in prev line is invalid.
    return _welcome


class RedirectException(Exception):
    """Signals an HTTP relocation; str() gives "(perm.) to URL" or "(temp.) to URL"."""

    # Maps the HTTP status code to the short tag used in the message.
    reloc_dict = {
        301: "perm",
        302: "temp"
    }

    def __init__(self, errcode, newurl):
        # errcode must be a key of reloc_dict (301 or 302), else KeyError.
        # NOTE(review): the recovered text was wrapped inside this literal;
        # confirm that the separator after "(%s.)" is a single space.
        Exception.__init__(self, "(%s.) to %s" % (self.reloc_dict[errcode], newurl))


class MyURLopener(urllib.URLopener):
    """URL opener that reports redirects and 401 responses as exceptions."""

    # Error 302 -- relocated (temporarily)
    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Raise RedirectException carrying the new location.

        The target is taken from the 'location' header, else from 'uri',
        else the placeholder "Nowhere" is used.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            newurl = "Nowhere"
        raise RedirectException(errcode, newurl)

    # Error 301 -- also relocated (permanently)
    http_error_301 = http_error_302

    # Error 401 -- authentication required
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Raise IOError('http error', errcode, "Authentication required ", headers)."""
        # Call form instead of the two-expression "raise IOError, (...)":
        # builds the same exception and parses on every Python version.
        raise IOError('http error', errcode, "Authentication required ", headers)


def get_error(msg):
    """Format an error value as a one-line string.

    A string is returned unchanged. Anything else is iterated: each item is
    converted with str(), its newlines are replaced by a literal backslash-n,
    it is wrapped in single quotes, and the items are joined with spaces
    inside parentheses.
    """
    if isinstance(msg, str):
        return msg

    s = []
    for i in msg:
        s.append("'%s'" % string.join(string.split(str(i), "\n"), "\\n"))
    return "(%s)" % string.join(s)

def check_url(record):
    """Retrieve record["URL"] into record["TEMPFILE"] and store the outcome in record.

    record is modified in place:
      - "TEMPFILE" is read and then removed;
      - on success "Size", "LastModified" and "MD5" are set ("LastModified"
        falls back to record["LastVisit"] when no usable Last-Modified
        header is available);
      - on failure exactly one of "Error", "NoError" or "Moved" is set;
      - "LastTested" is always set to the current time.
    Returns None.
    """
    # Taken before the try block so the final assignment can never see an
    # unbound name, whatever happens inside.
    now = str(int(time.time()))

    try:
        url_type, url_rest = urllib.splittype(record["URL"])
        url_host, url_path = urllib.splithost(url_rest)
        url_path, url_tag = urllib.splittag(url_path)

        tempfname = record["TEMPFILE"]
        del record["TEMPFILE"]

        # The URL is rebuilt without its #fragment before retrieval.
        fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path), tempfname)

        last_modified = None
        record["Size"] = str(os.stat(tempfname)[stat.ST_SIZE])

        if headers:
            try:
                last_modified = headers["Last-Modified"]
            except KeyError:
                last_modified = None

            if last_modified:
                last_modified = www_util.parse_time(last_modified)

        if last_modified:
            last_modified = str(int(last_modified))
        else:
            last_modified = record["LastVisit"]

        record["LastModified"] = last_modified

        md5 = md5wrapper()
        if url_type == "ftp": # Pass welcome message through MD5
            md5.update(get_welcome())

        md5.md5file(tempfname)
        record["MD5"] = str(md5)

    except IOError:
        # The instance is fetched through sys.exc_info() rather than the
        # "except IOError, msg" form so the handler parses everywhere.
        msg = sys.exc_info()[1]
        if (msg[0] == "http error") and (msg[1] == -1):
            record["NoError"] = "The server did not return any header - it is not an error, actually"
        else:
            record["Error"] = get_error(msg)

    except EOFError:
        record["Error"] = "Unexpected EOF (FTP server closed connection)"

    except RedirectException:
        record["Moved"] = str(sys.exc_info()[1])

    # Mark this even in case of error
    record["LastTested"] = now


def run():
    """Serve check requests: read pickled records from stdin, write results to stdout.

    Installs MyURLopener as urllib's default opener with a Mozilla-style
    User-agent, then loops forever over read / check_url / write.
    """
    urllib._urlopener = MyURLopener()

    # Some sites allow only Mozilla-compatible browsers; way to stop robots?
    server_version = "Mozilla/3.0 (compatible; Python-urllib/%s)" % urllib.__version__
    urllib._urlopener.addheaders[0] = ('User-agent', server_version)

    rec_in = RecordFile(sys.stdin)
    rec_out = RecordFile(sys.stdout)

    # The loop has no exit condition of its own; presumably it ends when
    # reading or unpickling fails at end of input - TODO confirm.
    while 1:
        # SECURITY: pickle.loads() can execute arbitrary code; stdin must
        # only ever be fed by a trusted parent process.
        record = pickle.loads(rec_in.read_record())
        check_url(record)
        rec_out.write_record(pickle.dumps(record))


if __name__ == '__main__':
    run()