diff --git a/tests/test_url.py b/tests/test_url.py index d0b35ae..9223067 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1,13 +1,10 @@ #!/usr/bin/env python3 +import urllib.parse from pathlib import PurePosixPath from typing import Any, cast import pytest - -try: - import webob -except ImportError: - webob = None +import webob from urlpath import URL, JailedURL @@ -185,7 +182,6 @@ def test_trailing_sep() -> None: assert URL("htp://example.com/with/double-sep//").trailing_sep == "//" -@pytest.mark.skipif(webob is None, reason="webob not installed") def test_webob() -> None: base_url = "http://www.example.com" url = URL(webob.Request.blank("/webob/request", base_url=base_url)) @@ -195,7 +191,6 @@ def test_webob() -> None: assert str(url / webob.Request.blank("/replaced/path")) == "http://localhost/replaced/path" -@pytest.mark.skipif(webob is None, reason="webob not installed") def test_webob_jail() -> None: request = webob.Request.blank("/path/to/filename.ext", {"SCRIPT_NAME": "/app/root"}) @@ -319,6 +314,38 @@ def test_pchar() -> None: assert str(url) == "s3://mybucket/some_folder/123_2017-10-30T18:43:11.csv.gz" +@pytest.mark.parametrize( + ("raw", "expected"), + ( + (urllib.parse.urlsplit("http://example.com/from-split?x=1#frag"), "http://example.com/from-split?x=1#frag"), + (urllib.parse.urlparse("https://example.com/from-parse"), "https://example.com/from-parse"), + (b"http://example.com/from-bytes", "http://example.com/from-bytes"), + ), +) +def test_constructor_canonicalizes_supported_types(raw: Any, expected: str) -> None: + class PathLike: + def __fspath__(self) -> str: + return "http://example.com/from-fspath" + + assert str(URL(raw)) == expected + + # PathLike objects should be accepted consistently regardless of other inputs + assert str(URL(PathLike())) == "http://example.com/from-fspath" + + +def test_multi_argument_constructor_matches_joinpath_semantics() -> None: + base = URL("http://example.com/base/") + + combined = URL("http://example.com/base/", "child", "../final") + chained = base / "child" / "../final" + + assert str(combined) == str(chained) + assert "\x00" not in "".join(combined.parts) + + absolute_override = URL("http://example.com/base/", "https://other.com/override", "tail") + assert str(absolute_override) == "https://other.com/override/tail" + + def test_percent_encoding_spaces() -> None: """Test that %20 encoded spaces don't get double-encoded.""" # Reported bug: URL with %20 in middle of path segment gets double-encoded to %2520 diff --git a/urlpath/_url.py b/urlpath/_url.py index 7879380..1ad0a4d 100644 --- a/urlpath/_url.py +++ b/urlpath/_url.py @@ -73,17 +73,19 @@ def __new__(cls, *args: Any) -> URL: Returns: New URL instance """ + canonicalized_args = tuple(cls._canonicalize_arg(a) for a in args) + + if len(canonicalized_args) > 1: + canonicalized_args = cls._combine_args(canonicalized_args) + if IS_PY312_PLUS: # Python 3.12: Canonicalize for stricter PurePath validation # Note: This happens BEFORE _parse_args, so it's not redundant - canonicalized_args = tuple(cls._canonicalize_arg(a) for a in args) - if len(canonicalized_args) > 1: - combined = cls._combine_args(canonicalized_args) - return super().__new__(cls, *combined) return super().__new__(cls, *canonicalized_args) - else: - # Python < 3.12: No early validation, canonicalization happens in _parse_args - return super().__new__(cls, *args) + + # Python < 3.12: Parent class will still invoke _parse_args, but we feed it + # the canonicalized arguments so multi-argument construction matches joinpath. + return super().__new__(cls, *canonicalized_args) def __init__(self, *args: Any) -> None: """Initialize URL instance. @@ -105,25 +107,24 @@ def __init__(self, *args: Any) -> None: super().__init__(*canonicalized_args) # else: Python < 3.12 doesn't call parent __init__ (it's object.__init__) - if IS_PY312_PLUS: - - @classmethod - def _combine_args(cls, canonicalized_args: tuple[str, ...]) -> tuple[str, ...]: - """Combine raw constructor arguments to emulate legacy joining semantics.""" - if not canonicalized_args: - return canonicalized_args - - current = canonicalized_args[0] - for seg in canonicalized_args[1:]: - parsed_current = urllib.parse.urlsplit(current) - parsed_segment = urllib.parse.urlsplit(seg) - - if parsed_segment.scheme: - current = urllib.parse.urlunsplit(parsed_segment) - continue - - if seg.startswith("/"): - current = urllib.parse.urlunsplit( + @classmethod + def _combine_args(cls, canonicalized_args: tuple[str, ...]) -> tuple[str, ...]: + """Combine raw constructor arguments to emulate legacy joining semantics.""" + if not canonicalized_args: + return canonicalized_args + + current = canonicalized_args[0] + for seg in canonicalized_args[1:]: + parsed_current = urllib.parse.urlsplit(current) + parsed_segment = urllib.parse.urlsplit(seg) + + if parsed_segment.scheme: + current = cleanup_escapes(urllib.parse.urlunsplit(parsed_segment)) + continue + + if seg.startswith("/"): + current = cleanup_escapes( + urllib.parse.urlunsplit( ( parsed_current.scheme, parsed_current.netloc, @@ -132,17 +133,19 @@ def _combine_args(cls, canonicalized_args: tuple[str, ...]) -> tuple[str, ...]: parsed_segment.fragment, ) ) - continue + ) + continue - base_path = parsed_current.path or ("/" if parsed_current.netloc else "") - joined_path = posixpath.join(base_path, seg) - if joined_path == ".": - joined_path = "" - else: - parts = joined_path.split("/") - if "." in parts: - joined_path = "/".join(part for part in parts if part != ".") - current = urllib.parse.urlunsplit( + base_path = parsed_current.path or ("/" if parsed_current.netloc else "") + joined_path = posixpath.join(base_path, seg) + if joined_path == ".": + joined_path = "" + else: + parts = joined_path.split("/") + if "." in parts: + joined_path = "/".join(part for part in parts if part != ".") + current = cleanup_escapes( + urllib.parse.urlunsplit( ( parsed_current.scheme, parsed_current.netloc, @@ -151,8 +154,11 @@ def _combine_args(cls, canonicalized_args: tuple[str, ...]) -> tuple[str, ...]: "", ) ) + ) - return (current,) + return (cleanup_escapes(current),) + + if IS_PY312_PLUS: @classmethod def _parse_path(cls, path: str) -> tuple[str, str, list[str]]: @@ -306,7 +312,10 @@ def _parse_args(cls, args: Any) -> Any: Returns: Parsed arguments suitable for parent class """ - return super()._parse_args(cls._canonicalize_arg(a) for a in args) + canonicalized = tuple(cls._canonicalize_arg(a) for a in args) + if len(canonicalized) > 1: + canonicalized = cls._combine_args(canonicalized) + return super()._parse_args(canonicalized) @classmethod def _canonicalize_arg(cls, a: Any) -> str: diff --git a/urlpath/_utils.py b/urlpath/_utils.py index 92767ca..9f889c9 100644 --- a/urlpath/_utils.py +++ b/urlpath/_utils.py @@ -240,4 +240,4 @@ def cleanup_escapes(text: str) -> str: Returns: String with \x00 replaced by / """ - return text.replace("\\x00", "/") + return text.replace("\x00", "/").replace("\\x00", "/").replace("%5Cx00", "/").replace("%5cx00", "/")