bin/GET.py

   1 #! /usr/bin/env python
   2
   3 import sys
   4 url = sys.argv[1]
   5
   6 import urllib
   7 urllib._urlopener = urllib.FancyURLopener()
   8
   9 from cgi import parse_qsl
  10 from m_lib.defenc import default_encoding
  11
  12 protocol, request = urllib.splittype(url)
  13 user, password, port = None, None, None
  14 host, path = urllib.splithost(request)
  15 if host:
  16    user, host = urllib.splituser(host)
  17    if user:
  18       user, password = urllib.splitpasswd(user)
  19    host, port = urllib.splitport(host)
  20    if port: port = int(port)
  21 path, tag = urllib.splittag(path)
  22 path, query = urllib.splitquery(path)
  23 path = urllib.unquote(path)
  24 if tag: tag = urllib.unquote_plus(tag)
  25
  26 if query:
  27    qlist = []
  28    for name, value in parse_qsl(query):
  29        qlist.append((name, value))
  30
  31 url = protocol + "://"
  32 if user:
  33    url += urllib.quote(user)
  34    if password:
  35       url += ':' + urllib.quote(password)
  36    url += '@'
  37 if host:
  38    url += host.decode(default_encoding).encode('idna')
  39    if port:
  40       url += ':%d' % port
  41 if path:
  42    if protocol == "file":
  43       url += urllib.quote(path)
  44    else:
  45       url += urllib.quote(path)
  46 if query:
  47    url += '?' + urllib.urlencode(qlist)
  48 if tag:
  49    url += '#' + urllib.quote_plus(tag)
  50
  51 # I remember seeing some sites that return broken HTML or even HTTP response
  52 # without "compatible" user agent; I don't know if such sites are still around,
  53 # but this header doesn't cause any harm so I'd better continue to use it.
  54 # UPDATE: I saw a number of sites that forbid "Mozilla compatible"
  55 server_version = "Python-urllib/%s" % urllib.__version__
  56 urllib._urlopener.addheaders[0] = ('User-agent', server_version)
  57 urllib._urlopener.addheaders.append(('Accept-Charset', "koi8-r;q=1.0"))
  58
  59 import os
  60 dest_file = os.path.basename(url)
  61
  62 if not dest_file:
  63    dest_file = "_index.html"
  64
  65 filename, headers = urllib.urlretrieve(url, dest_file)
  66
  67 if headers.has_key("last-modified"):
  68    from m_lib.net.www.util import parse_time
  69    last_modified = parse_time(headers["last-modified"])
  70    if last_modified:
  71       os.utime(dest_file, (last_modified, last_modified))