From acd7161cfd52ae4889e41598a91de38ee66af9d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Jun 2024 07:50:07 +0200 Subject: [PATCH 1/2] canonicalize_url: do not apply lowercase to userinfo --- tests/test_url.py | 5 +++++ w3lib/url.py | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_url.py b/tests/test_url.py index ca84745..f969cff 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1384,6 +1384,11 @@ def test_domains_are_case_insensitive(self): canonicalize_url("http://www.EXAMPLE.com/"), "http://www.example.com/" ) + def test_userinfo_is_case_sensitive(self): + self.assertEqual( + canonicalize_url("sftp://UsEr:PaSsWoRd@www.EXAMPLE.com/"), "sftp://UsEr:PaSsWoRd@www.example.com/" + ) + def test_canonicalize_idns(self): self.assertEqual( canonicalize_url("http://www.bücher.de?q=bücher"), diff --git a/w3lib/url.py b/w3lib/url.py index 28e70cb..bb6486c 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -654,9 +654,14 @@ def canonicalize_url( fragment = "" if not keep_fragments else fragment + # Apply lowercase to the domain, but not to the userinfo. + netloc_parts = netloc.split("@") + netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":") + netloc = "@".join(netloc_parts) + # every part should be safe already return urlunparse( - (scheme, netloc.lower().rstrip(":"), path, params, query, fragment) + (scheme, netloc, path, params, query, fragment) ) From a4b444c5960fd082322d50adbab2c0c5a78666a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Jun 2024 07:52:18 +0200 Subject: [PATCH 2/2] Run pre-commit --- tests/test_url.py | 3 ++- w3lib/url.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_url.py b/tests/test_url.py index f969cff..319d76c 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1386,7 +1386,8 @@ def test_domains_are_case_insensitive(self): def test_userinfo_is_case_sensitive(self): self.assertEqual( - canonicalize_url("sftp://UsEr:PaSsWoRd@www.EXAMPLE.com/"), "sftp://UsEr:PaSsWoRd@www.example.com/" + canonicalize_url("sftp://UsEr:PaSsWoRd@www.EXAMPLE.com/"), + "sftp://UsEr:PaSsWoRd@www.example.com/", ) def test_canonicalize_idns(self): diff --git a/w3lib/url.py b/w3lib/url.py index bb6486c..c142048 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -660,9 +660,7 @@ def canonicalize_url( netloc = "@".join(netloc_parts) # every part should be safe already - return urlunparse( - (scheme, netloc, path, params, query, fragment) - ) + return urlunparse((scheme, netloc, path, params, query, fragment)) def _unquotepath(path: str) -> bytes: