bin/GET.py

   1 #! /usr/bin/env python3
   2
   3 try:
   4     PY2 = False
   5     from urllib.parse import parse_qsl, urlencode, \
   6         quote, quote_plus, unquote, unquote_plus, \
   7         splittype, splithost, splituser, splitpasswd, \
   8         splitport, splittag, splitquery
   9     from urllib.request import urlretrieve
  10     import urllib.request
  11 except ImportError:
  12     PY2 = True
  13     from cgi import parse_qsl
  14     from urllib import urlencode, quote, quote_plus, unquote, unquote_plus, \
  15         splittype, splithost, splituser, splitpasswd, \
  16         splitport, splittag, splitquery, urlretrieve
  17
  18 import os
  19 import sys
  20 import urllib
  21 from m_lib.defenc import default_encoding
  22
  23 url = sys.argv[1]
  24 if PY2:
  25     _urlopener = urllib._urlopener = urllib.FancyURLopener()
  26 else:
  27     _urlopener = urllib.request._opener = urllib.request.FancyURLopener()
  28
  29 protocol, request = splittype(url)
  30 user, password, port = None, None, None
  31 host, path = splithost(request)
  32 if host:
  33     user, host = splituser(host)
  34     if user:
  35         user, password = splitpasswd(user)
  36     host, port = splitport(host)
  37     if port: port = int(port)
  38 path, tag = splittag(path)
  39 path, query = splitquery(path)
  40 path = unquote(path)
  41 if tag: tag = unquote_plus(tag)
  42
  43 if query:
  44     qlist = []
  45     for name, value in parse_qsl(query):
  46         qlist.append((name, value))
  47
  48 url = protocol + "://"
  49 if user:
  50     url += quote(user)
  51     if password:
  52         url += ':' + quote(password)
  53     url += '@'
  54 if host:
  55     if PY2:
  56         host = host.decode(default_encoding)
  57     host = host.encode('idna')
  58     if not PY2:
  59         host = host.decode('ascii')
  60     url += host
  61     if port:
  62         url += ':%d' % port
  63 if path:
  64     if protocol == "file":
  65         url += quote(path)
  66     else:
  67         url += quote(path)
  68 if query:
  69     url += '?' + urlencode(qlist)
  70 if tag:
  71     url += '#' + quote_plus(tag)
  72
  73 # I remember seeing some sites that return broken HTML or even HTTP response
  74 # without "compatible" user agent; I don't know if such sites are still around,
  75 # but this header doesn't cause any harm so I'd better continue to use it.
  76 # UPDATE: I saw a number of sites that forbid "Mozilla compatible"
  77 if PY2:
  78     urllib_version = urllib.__version__
  79 else:
  80     urllib_version = urllib.request.__version__
  81 server_version = "Python-urllib/%s" % urllib_version
  82
  83 _urlopener.addheaders[0] = ('User-agent', server_version)
  84 _urlopener.addheaders.append(('Accept-Charset', "koi8-r;q=1.0"))
  85
  86 dest_file = os.path.basename(url)
  87 if not dest_file:
  88     dest_file = "_index.html"
  89
  90 filename, headers = urlretrieve(url, filename=dest_file)
  91
  92 if "last-modified" in headers:
  93     from m_lib.net.www.util import parse_time
  94     last_modified = parse_time(headers["last-modified"])
  95     if last_modified:
  96         os.utime(dest_file, (last_modified, last_modified))