def encode_url(url, encoding):
    """Return *url* rewritten as a pure-ASCII URL.

    PycURL does not encode URLs itself and requires them to be ASCII, so
    every component that may carry non-ASCII text is converted here:

    * the host name is IDNA-encoded (IPv6 literals are kept in brackets);
    * user name, password, path and fragment are percent-encoded from
      their byte representation in *encoding*;
    * the query string is parsed into name/value pairs and re-serialized
      with ``urlencode``.

    Percent-escapes that are already present in the user name, password,
    path and fragment are preserved as-is instead of being escaped a
    second time.

    Args:
        url: The URL as ``str``; may contain non-ASCII characters.
        encoding: Charset used to turn non-ASCII characters into bytes
            before percent-encoding.  When empty or ``None`` the
            module-level ``default_encoding`` is used.

    Returns:
        The re-assembled URL as an ASCII-only ``str``.

    Raises:
        UnicodeEncodeError: A character cannot be represented in
            *encoding*.
        UnicodeError: The host name is not a valid IDNA name.
        ValueError: The port in *url* is not a valid port number.
    """
    if not encoding:
        encoding = default_encoding

    parts = urlsplit(url)
    scheme, _netloc, path, query, fragment = parts

    def _escape(text):
        # '%' stays safe so that escapes already present in the URL are
        # not double-encoded ("%20" must not become "%2520").
        return quote(text.encode(encoding), safe='/%')

    pieces = [scheme, '://']

    user = parts.username
    if user:
        pieces.append(_escape(user))
        password = parts.password
        if password:
            pieces.append(':' + _escape(password))
        pieces.append('@')

    host = parts.hostname
    if host:
        if ':' in host:
            # ``hostname`` strips the brackets from an IPv6 literal;
            # put them back, otherwise the port separator is ambiguous.
            pieces.append('[%s]' % host)
        else:
            pieces.append(host.encode('idna').decode('ascii'))
        port = parts.port
        if port is not None:
            pieces.append(':%d' % port)

    if path:
        if scheme == "file":
            # Local file paths are always escaped as UTF-8, regardless
            # of the requested encoding.
            pieces.append(quote(path, safe='/%'))
        else:
            pieces.append(_escape(path))

    if query:
        # keep_blank_values: do not silently drop "name=" parameters.
        pairs = [
            (name.encode(encoding), value.encode(encoding))
            for name, value in parse_qsl(query, keep_blank_values=True)
        ]
        pieces.append('?' + urlencode(pairs))

    if fragment:
        pieces.append('#' + quote_plus(fragment.encode(encoding), safe='%'))

    return ''.join(pieces)