From 32739364fb032dc1a55c62c0cb6c28ccfa95f765 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Thu, 26 Sep 2024 23:18:37 +0100 Subject: [PATCH] Validate ASCII host values (#954) Co-authored-by: J. Nick Koston Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGES/880.bugfix.rst | 3 ++ CHANGES/954.bugfix.rst | 1 + docs/api.rst | 22 +++++++------ docs/spelling_wordlist.txt | 1 + tests/test_cache.py | 12 +++++-- tests/test_url.py | 11 +++++++ tests/test_url_build.py | 19 +++++++++++ tests/test_url_update_netloc.py | 20 ++++++++++++ yarl/_url.py | 57 ++++++++++++++++++++++++++++++--- 9 files changed, 130 insertions(+), 16 deletions(-) create mode 100644 CHANGES/880.bugfix.rst create mode 120000 CHANGES/954.bugfix.rst diff --git a/CHANGES/880.bugfix.rst b/CHANGES/880.bugfix.rst new file mode 100644 index 00000000..39a64ea0 --- /dev/null +++ b/CHANGES/880.bugfix.rst @@ -0,0 +1,3 @@ +Started rejecting ASCII hostnames with invalid characters. For host strings that +look like authority strings, the exception message includes advice on what to do +instead -- by :user:`mjpieters`. diff --git a/CHANGES/954.bugfix.rst b/CHANGES/954.bugfix.rst new file mode 120000 index 00000000..e6d28843 --- /dev/null +++ b/CHANGES/954.bugfix.rst @@ -0,0 +1 @@ +880.bugfix.rst \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst index 170df7f3..b7b80cf6 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1037,19 +1037,20 @@ Default port substitution Cache control ------------- -IDNA conversion and IP Address parsing used for host encoding are quite expensive -operations, that's why the ``yarl`` library caches these calls by storing -last ``256`` results in the global LRU cache. +IDNA conversion, host validation, and IP Address parsing used for host +encoding are quite expensive operations, that's why the ``yarl`` +library caches these calls by storing last ``256`` results in the +global LRU cache. .. function:: cache_clear() - Clear IDNA and IP Address caches. + Clear IDNA, host validation, and IP Address caches. .. function:: cache_info() - Return a dictionary with ``"idna_encode"``, ``"idna_decode"``, and - ``"ip_address"`` keys, each value + Return a dictionary with ``"idna_encode"``, ``"idna_decode"``, ``"ip_address"``, + and ``"host_validate"`` keys, each value points to corresponding ``CacheInfo`` structure (see :func:`functools.lru_cache` for details): @@ -1059,12 +1060,15 @@ last ``256`` results in the global LRU cache. >>> yarl.cache_info() {'idna_encode': CacheInfo(hits=5, misses=5, maxsize=256, currsize=5), 'idna_decode': CacheInfo(hits=24, misses=15, maxsize=256, currsize=15), - 'ip_address': CacheInfo(hits=46933, misses=84, maxsize=256, currsize=101)} + 'ip_address': CacheInfo(hits=46933, misses=84, maxsize=256, currsize=101), + 'host_validate': CacheInfo(hits=0, misses=0, maxsize=256, currsize=0)} -.. function:: cache_configure(*, idna_encode_size=256, idna_decode_size=256, ip_address_size=256) - Set the IP Address and IDNA encode and decode cache sizes (``256`` for each by default). +.. function:: cache_configure(*, idna_encode_size=256, idna_decode_size=256, ip_address_size=256, host_validate_size=256) + + Set the IP Address, host validation, and IDNA encode and + decode cache sizes (``256`` for each by default). Pass ``None`` to make the corresponding cache unbounded (may speed up host encoding operation a little but the memory footprint can be very high, diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 6b90d22e..8e66ca9e 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -27,6 +27,7 @@ facto glibc google hardcoded +hostnames macOS mailto manylinux diff --git a/tests/test_cache.py b/tests/test_cache.py index 46d5b01f..9b0e75a3 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -13,7 +13,7 @@ def test_cache_clear() -> None: def test_cache_info() -> None: info = yarl.cache_info() - assert info.keys() == {"idna_encode", "idna_decode", "ip_address"} + assert info.keys() == {"idna_encode", "idna_decode", "ip_address", "host_validate"} def test_cache_configure_default() -> None: @@ -22,11 +22,17 @@ def test_cache_configure_default() -> None: def test_cache_configure_None() -> None: yarl.cache_configure( - idna_encode_size=None, idna_decode_size=None, ip_address_size=None + idna_encode_size=None, + idna_decode_size=None, + ip_address_size=None, + host_validate_size=None, ) def test_cache_configure_explicit() -> None: yarl.cache_configure( - idna_encode_size=128, idna_decode_size=128, ip_address_size=128 + idna_encode_size=128, + idna_decode_size=128, + ip_address_size=128, + host_validate_size=128, ) diff --git a/tests/test_url.py b/tests/test_url.py index 5115c874..49245b8d 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -2058,3 +2058,14 @@ def test_parsing_populates_cache(): assert url._cache["raw_query_string"] == "a=b" assert url._cache["raw_fragment"] == "frag" assert url._cache["scheme"] == "http" + + +@pytest.mark.parametrize( + ("host", "is_authority"), + [ + *(("other_gen_delim_" + c, False) for c in "[]"), + ], +) +def test_build_with_invalid_ipv6_host(host: str, is_authority: bool): + with pytest.raises(ValueError, match="Invalid IPv6 URL"): + URL(f"http://{host}/") diff --git a/tests/test_url_build.py b/tests/test_url_build.py index b6675c29..0b7b9cd7 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -120,6 +120,25 @@ def test_build_with_authority_and_host(): URL.build(authority="host.com", host="example.com") +@pytest.mark.parametrize( + ("host", "is_authority"), + [ + ("user:pass@host.com", True), + ("user@host.com", True), + ("host:com", False), + ("not_percent_encoded%Zf", False), + ("still_not_percent_encoded%fZ", False), + *(("other_gen_delim_" + c, False) for c in "/?#[]"), + ], +) +def test_build_with_invalid_host(host: str, is_authority: bool): + match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)" + if is_authority: + match += ", if .* use 'authority' instead of 'host'" + with pytest.raises(ValueError, match=f"{match}$"): + URL.build(host=host) + + def test_build_with_authority(): url = URL.build(scheme="http", authority="степан:bar@host.com:8000", path="path") assert ( diff --git a/tests/test_url_update_netloc.py b/tests/test_url_update_netloc.py index 295d6201..cdb07a2d 100644 --- a/tests/test_url_update_netloc.py +++ b/tests/test_url_update_netloc.py @@ -171,6 +171,26 @@ def test_with_host_non_ascii(): assert url2.authority == "оун-упа.укр:123" +@pytest.mark.parametrize( + ("host", "is_authority"), + [ + ("user:pass@host.com", True), + ("user@host.com", True), + ("host:com", False), + ("not_percent_encoded%Zf", False), + ("still_not_percent_encoded%fZ", False), + *(("other_gen_delim_" + c, False) for c in "/?#[]"), + ], +) +def test_with_invalid_host(host: str, is_authority: bool): + url = URL("http://example.com:123") + match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)" + if is_authority: + match += ", if .* use 'authority' instead of 'host'" + with pytest.raises(ValueError, match=f"{match}$"): + url.with_host(host=host) + + def test_with_host_percent_encoded(): url = URL("http://%25cf%2580%cf%80:%25cf%2580%cf%80@example.com:123") url2 = url.with_host("%cf%80.org") diff --git a/yarl/_url.py b/yarl/_url.py index 37e4739f..25988f81 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -1,4 +1,5 @@ import math +import re import sys import warnings from collections.abc import Mapping, Sequence @@ -45,6 +46,22 @@ sentinel = object() +# reg-name: unreserved / pct-encoded / sub-delims +# this pattern matches anything that is *not* in those classes. and is only used +# on lower-cased ASCII values. +_not_reg_name = re.compile( + r""" + # any character not in the unreserved or sub-delims sets, plus % + # (validated with the additional check for pct-encoded sequences below) + [^a-z0-9\-._~!$&'()*+,;=%] + | + # % only allowed if it is part of a pct-encoded + # sequence of 2 hex digits. + %(?![0-9a-f]{2}) + """, + re.VERBOSE, +) + SimpleQuery = Union[str, int, float] QueryVariable = Union[SimpleQuery, "Sequence[SimpleQuery]"] Query = Union[ @@ -64,6 +81,7 @@ class CacheInfo(TypedDict): idna_encode: _CacheInfo idna_decode: _CacheInfo ip_address: _CacheInfo + host_validate: _CacheInfo class _SplitResultDict(TypedDict, total=False): @@ -269,7 +287,7 @@ def __new__( raise ValueError(msg) else: host = "" - host = cls._encode_host(host) + host = cls._encode_host(host, validate_host=False) raw_user = None if username is None else cls._REQUOTER(username) raw_password = None if password is None else cls._REQUOTER(password) netloc = cls._make_netloc( @@ -959,7 +977,9 @@ def _normalize_path(cls, path: str) -> str: return prefix + "/".join(_normalize_path_segments(segments)) @classmethod - def _encode_host(cls, host: str, human: bool = False) -> str: + def _encode_host( + cls, host: str, human: bool = False, validate_host: bool = True + ) -> str: if "%" in host: raw_ip, sep, zone = host.partition("%") else: @@ -999,11 +1019,18 @@ def _encode_host(cls, host: str, human: bool = False) -> str: return host host = host.lower() + if human: + return host + # IDNA encoding is slow, # skip it for ASCII-only strings # Don't move the check into _idna_encode() helper # to reduce the cache size - if human or host.isascii(): + if host.isascii(): + # Check for invalid characters explicitly; _idna_encode() does this + # for non-ascii host names. + if validate_host: + _host_validate(host) return host return _idna_encode(host) @@ -1559,12 +1586,31 @@ def _ip_compressed_version(raw_ip: str) -> Tuple[str, int]: return ip.compressed, ip.version +@lru_cache(_MAXCACHE) +def _host_validate(host: str) -> None: + """Validate an ascii host name.""" + invalid = _not_reg_name.search(host) + if invalid is None: + return + value, pos, extra = invalid.group(), invalid.start(), "" + if value == "@" or (value == ":" and "@" in host[pos:]): + # this looks like an authority string + extra = ( + ", if the value includes a username or password, " + "use 'authority' instead of 'host'" + ) + raise ValueError( + f"Host {host!r} cannot contain {value!r} (at position " f"{pos}){extra}" + ) from None + + @rewrite_module def cache_clear() -> None: """Clear all LRU caches.""" _idna_decode.cache_clear() _idna_encode.cache_clear() _ip_compressed_version.cache_clear() + _host_validate.cache_clear() @rewrite_module @@ -1574,6 +1620,7 @@ def cache_info() -> CacheInfo: "idna_encode": _idna_encode.cache_info(), "idna_decode": _idna_decode.cache_info(), "ip_address": _ip_compressed_version.cache_info(), + "host_validate": _host_validate.cache_info(), } @@ -1583,12 +1630,14 @@ def cache_configure( idna_encode_size: Union[int, None] = _MAXCACHE, idna_decode_size: Union[int, None] = _MAXCACHE, ip_address_size: Union[int, None] = _MAXCACHE, + host_validate_size: Union[int, None] = _MAXCACHE, ) -> None: """Configure LRU cache sizes.""" - global _idna_decode, _idna_encode, _ip_compressed_version + global _idna_decode, _idna_encode, _ip_compressed_version, _host_validate _idna_encode = lru_cache(idna_encode_size)(_idna_encode.__wrapped__) _idna_decode = lru_cache(idna_decode_size)(_idna_decode.__wrapped__) _ip_compressed_version = lru_cache(ip_address_size)( _ip_compressed_version.__wrapped__ ) + _host_validate = lru_cache(host_validate_size)(_host_validate.__wrapped__)