bin/GET.py

   1 #! /usr/bin/env python3
   2
   3 from urllib.parse import urlsplit, parse_qsl, quote, quote_plus, urlencode
   4 from urllib.request import urlretrieve
   5 import os
   6 import socket
   7 import sys
   8 import urllib.request
   9
  10 url = sys.argv[1]
  11
  12 split_results = urlsplit(url)
  13 protocol, netloc, path, query, tag = split_results
  14 user = split_results.username
  15 password = split_results.password
  16 host = split_results.hostname
  17 port = split_results.port
  18 qlist = parse_qsl(query)
  19
  20 url = protocol + "://"
  21 if user:
  22     url += quote(user)
  23     if password:
  24         url += ':' + quote(password)
  25     url += '@'
  26 if host:
  27     host = host.encode('idna').decode('ascii')
  28     url += host
  29     if port:
  30         url += ':%d' % port
  31 if path:
  32     if protocol == "file":
  33         url += quote(path)
  34     else:
  35         url += quote(path)
  36 if query:
  37     url += '?' + urlencode(qlist)
  38 if tag:
  39     url += '#' + quote_plus(tag)
  40
  41 # I remember seeing some sites that return broken HTML or even HTTP response
  42 # without "compatible" user agent; I don't know if such sites are still around,
  43 # but this header doesn't cause any harm so I'd better continue to use it.
  44 # UPDATE: I saw a number of sites that forbid "Mozilla compatible"
  45 urllib_version = urllib.request.__version__
  46 server_version = "Python-urllib/%s" % urllib_version
  47
  48
  49 class MyURLopener(urllib.request.URLopener):
  50     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
  51         return urllib.request.URLopener.open(self, fullurl, data)
  52
  53
  54 _urlopener = urllib.request._opener = MyURLopener()
  55 _urlopener.addheaders[0] = ('User-agent', server_version)
  56 _urlopener.addheaders.append(('Accept-Charset', "koi8-r;q=1.0"))
  57
  58 dest_file = os.path.basename(url)
  59 if not dest_file:
  60     dest_file = "_index.html"
  61
  62 filename, headers = urlretrieve(url, filename=dest_file)
  63
  64 if "last-modified" in headers:
  65     from m_lib.net.www.util import parse_time
  66     last_modified = parse_time(headers["last-modified"])
  67     if last_modified:
  68         os.utime(dest_file, (last_modified, last_modified))