1 #! /usr/bin/env python3
5 from urllib.parse import parse_qsl, urlencode, \
6 quote, quote_plus, unquote, unquote_plus, \
7 splittype, splithost, splituser, splitpasswd, \
8 splitport, splittag, splitquery
9 from urllib.request import urlretrieve
13 from cgi import parse_qsl
14 from urllib import urlencode, quote, quote_plus, unquote, unquote_plus, \
15 splittype, splithost, splituser, splitpasswd, \
16 splitport, splittag, splitquery, urlretrieve
21 from m_lib.defenc import default_encoding
# Create one FancyURLopener, keep it in the module-level name `_urlopener`,
# and install it inside urllib as well. The two statements are alternatives
# for the two urllib layouts: `urllib._urlopener` / `urllib.FancyURLopener`
# versus `urllib.request._opener` / `urllib.request.FancyURLopener`.
# NOTE(review): the condition selecting between them is not visible in this
# listing - presumably a Python 2 / Python 3 check; confirm.
# NOTE(review): FancyURLopener is a legacy interface (deprecated since
# Python 3.3) - confirm it exists on every interpreter this must run on.
25 _urlopener = urllib._urlopener = urllib.FancyURLopener()
27 _urlopener = urllib.request._opener = urllib.request.FancyURLopener()
# Fragment of a routine that takes `url` apart and rebuilds it with each
# component re-quoted. The enclosing definition and the conditions guarding
# several statements are not visible in this listing, so the notes below
# describe each visible statement on its own.
#
# Split the scheme (`protocol`) from the rest of the URL (`request`).
29 protocol, request = splittype(url)
# Defaults for the components that may be absent from the URL.
30 user, password, port = None, None, None
# Separate the host part from the path.
31 host, path = splithost(request)
# Peel the user info off the host, then the password off the user info.
# NOTE(review): the guards around these two statements are not visible.
33 user, host = splituser(host)
35 user, password = splitpasswd(user)
# Separate the port from the host; a present port is converted to int.
36 host, port = splitport(host)
37 if port: port = int(port)
# Cut the fragment ("tag") and then the query string off the path.
38 path, tag = splittag(path)
39 path, query = splitquery(path)
# The fragment is unquoted here; it is quoted again with quote_plus when the
# URL is reassembled below.
41 if tag: tag = unquote_plus(tag)
# Collect the parsed query parameters as (name, value) pairs.
# NOTE(review): the initialization of `qlist` is not visible in this listing.
45 for name, value in parse_qsl(query):
46 qlist.append((name, value))
# Reassemble the URL, starting with the scheme.
48 url = protocol + "://"
# The password is percent-quoted and appended after a colon.
# NOTE(review): the statement adding the user name and the guard on
# `password` are not visible here.
52 url += ':' + quote(password)
# Convert the host name to its IDNA form: decode with the project's default
# encoding, encode with the 'idna' codec, then decode the result as ASCII.
# NOTE(review): the decode steps are presumably conditional on the string
# type in use (bytes vs. text) - the guards are not visible; confirm.
56 host = host.decode(default_encoding)
57 host = host.encode('idna')
59 host = host.decode('ascii')
# "file" URLs are handled separately when the path is appended.
# NOTE(review): the bodies of this branch and its alternative are not visible.
64 if protocol == "file":
# Append the re-encoded query string and the re-quoted fragment.
# NOTE(review): presumably each is added only when non-empty - confirm.
69 url += '?' + urlencode(qlist)
71 url += '#' + quote_plus(tag)
73 # I remember seeing some sites that return broken HTML or even a broken HTTP
74 # response without a "compatible" user agent; I don't know if such sites are
75 # still around, but this header doesn't cause any harm, so I'd better continue to use it.
76 # UPDATE: I saw a number of sites that forbid "Mozilla compatible"
# Read the urllib version string from whichever module layout is in use
# (`urllib.__version__` or `urllib.request.__version__`).
# NOTE(review): the condition selecting between the two assignments is not
# visible in this listing.
78 urllib_version = urllib.__version__
80 urllib_version = urllib.request.__version__
# User-agent value in the form "Python-urllib/<version>".
81 server_version = "Python-urllib/%s" % urllib_version
# Overwrite the first header entry of the shared opener with this User-agent.
# NOTE(review): assumes addheaders[0] is the opener's default User-agent
# entry - confirm against the opener class in use.
83 _urlopener.addheaders[0] = ('User-agent', server_version)
# Also send an Accept-Charset header that names koi8-r with q=1.0.
84 _urlopener.addheaders.append(('Accept-Charset', "koi8-r;q=1.0"))
# Fragment of a download routine; the enclosing definition and some guarding
# conditions are not visible in this listing.
#
# Default destination name: the last path component of the URL.
86 dest_file = os.path.basename(url)
# Fallback destination name.
# NOTE(review): presumably used when the basename is empty (URL ends with
# "/") - the condition is not visible; confirm.
88 dest_file = "_index.html"
# Download the URL into `dest_file`; `headers` holds the response headers.
90 filename, headers = urlretrieve(url, filename=dest_file)
# When the response carries a Last-Modified header, parse it and stamp the
# downloaded file with that time.
92 if "last-modified" in headers:
# Imported here, at the point of use, rather than at module level.
93 from m_lib.net.www.util import parse_time
94 last_modified = parse_time(headers["last-modified"])
# Set both the access time and the modification time to the parsed value.
# NOTE(review): a guard on the parse result may sit on the line missing
# before this statement - confirm.
96 os.utime(dest_file, (last_modified, last_modified))