1 #! /usr/bin/env python3
3 from urllib.parse import urlsplit, parse_qsl, quote, quote_plus, urlencode
4 from urllib.request import urlretrieve
# Split the URL into its parts and rebuild it piece by piece with each part
# quoted/encoded on its own.
# NOTE(review): the initial assignment of `url` and several guarding `if`
# statements are not visible in this excerpt; the comments below describe
# only the statements that are shown.
12 split_results = urlsplit(url)
# The split result unpacks into five fields; `tag` receives the fragment.
13 protocol, netloc, path, query, tag = split_results
14 user = split_results.username
15 password = split_results.password
16 host = split_results.hostname
17 port = split_results.port
# Parse the query string into a list of (name, value) pairs so that it can be
# re-encoded with urlencode() below.
18 qlist = parse_qsl(query)
# Start rebuilding `url` from scratch, beginning with the scheme.
20 url = protocol + "://"
# Append the percent-quoted password after a ':' separator
# (presumably only when a password is present -- guard not visible here).
24 url += ':' + quote(password)
# Convert the host name to its ASCII (IDNA) form.
27 host = host.encode('idna').decode('ascii')
# The "file" scheme gets special treatment; the branch body is not visible here.
32 if protocol == "file":
# Re-encode the parsed query pairs and append them after '?'.
37 url += '?' + urlencode(qlist)
# Append the fragment after '#', quoted with quote_plus().
39 url += '#' + quote_plus(tag)
41 # I remember seeing some sites that return broken HTML, or even a broken HTTP
42 # response, without a "compatible" user agent; I don't know if such sites are
43 # still around, but this header doesn't cause any harm so I'd better keep using it.
44 # UPDATE: I have since seen a number of sites that forbid "Mozilla compatible",
# so the plain "Python-urllib/<version>" agent string is used instead.
# User-agent value sent with requests: the stock urllib identification string
# built from the version reported by urllib.request itself.
urllib_version = urllib.request.__version__
server_version = f"Python-urllib/{urllib_version}"
class MyURLopener(urllib.request.URLopener):
    """URLopener whose ``open()`` also accepts a ``timeout`` argument.

    The base ``URLopener.open()`` is called with only the URL and the data,
    so this subclass lets callers pass a timeout without getting a TypeError.
    """

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl*, optionally sending *data*.

        *timeout* is accepted for call compatibility only; it is not
        forwarded to the base implementation and has no effect.
        """
        return super().open(fullurl, data)
# Create the custom opener and install it as urllib.request's module-wide
# opener, so that later urllib.request calls are routed through it.
_urlopener = MyURLopener()
urllib.request.install_opener(_urlopener)
# Overwrite the first entry of the opener's header list with our own
# User-agent, then ask servers for KOI8-R encoded content.
_urlopener.addheaders[0] = ("User-agent", server_version)
_urlopener.addheaders.append(("Accept-Charset", "koi8-r;q=1.0"))
# Download the URL into a local file named after the last component of the
# URL, then try to give that file the server-reported modification time.
# NOTE(review): some guarding `if` statements are not visible in this excerpt.
58 dest_file = os.path.basename(url)
# Fallback file name (presumably used when the basename above is empty --
# guard not visible here).
60 dest_file = "_index.html"
# urlretrieve() returns the local file name and the response headers.
62 filename, headers = urlretrieve(url, filename=dest_file)
64 if "last-modified" in headers:
# parse_time is a project helper imported lazily, only when the header exists.
65 from m_lib.net.www.util import parse_time
# presumably converts the header text to a timestamp os.utime() accepts -- TODO confirm
66 last_modified = parse_time(headers["last-modified"])
# Set both the access time and the modification time of the downloaded file
# to the parsed value.
68 os.utime(dest_file, (last_modified, last_modified))