From 395dd9b744069483549a39297bc2e2cd88edea88 Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Thu, 6 Jun 2024 20:14:58 +0000 Subject: [PATCH 01/11] add option to require explicit fetch/checkout --- dvc/output.py | 6 ++++++ dvc/repo/checkout.py | 10 +++++++++- dvc/repo/fetch.py | 2 ++ tests/func/test_checkout.py | 20 +++++++++++++++++++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/dvc/output.py b/dvc/output.py index 7eef73d646..ebdbac1b09 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -88,6 +88,7 @@ def loadd_from(stage, d_list): for d in d_list: p = d.pop(Output.PARAM_PATH) cache = d.pop(Output.PARAM_CACHE, True) + explicit = d.pop(Output.PARAM_EXPLICIT, False) metric = d.pop(Output.PARAM_METRIC, False) plot = d.pop(Output.PARAM_PLOT, False) persist = d.pop(Output.PARAM_PERSIST, False) @@ -103,6 +104,7 @@ def loadd_from(stage, d_list): p, info=d, cache=cache, + explicit=explicit, metric=metric, plot=plot, persist=persist, @@ -282,6 +284,7 @@ class Output: PARAM_PATH = "path" PARAM_CACHE = "cache" + PARAM_EXPLICIT = "explicit" PARAM_FILES = "files" PARAM_METRIC = "metric" PARAM_METRIC_TYPE = "type" @@ -312,6 +315,7 @@ def __init__( # noqa: PLR0913 path, info=None, cache=True, + explicit=False, metric=False, plot=False, persist=False, @@ -384,6 +388,7 @@ def __init__( # noqa: PLR0913 files = [merge_file_meta_from_cloud(f) for f in files] self.files = files self.use_cache = False if self.IS_DEPENDENCY else cache + self.explicit = explicit self.metric = False if self.IS_DEPENDENCY else metric self.plot = False if self.IS_DEPENDENCY else plot self.persist = persist @@ -1500,6 +1505,7 @@ def _merge_dir_version_meta(self, other: "Output"): **ARTIFACT_SCHEMA, **ANNOTATION_SCHEMA, Output.PARAM_CACHE: bool, + Output.PARAM_EXPLICIT: bool, Output.PARAM_REMOTE: str, Output.PARAM_PUSH: bool, Output.PARAM_FILES: [DIR_FILES_SCHEMA], diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index e214fc8536..78bc2dacca 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -14,6 +14,7 @@ from . import locked if TYPE_CHECKING: + from dvc.output import Output from dvc_data.index import BaseDataIndex, DataIndexEntry from dvc_objects.fs.base import FileSystem @@ -127,8 +128,15 @@ def onerror(target, exc): raise CheckoutErrorSuggestGit(target) from exc raise + def outs_filter(out: "Output") -> bool: + return not out.explicit or any(t for t in targets) + view = self.index.targets_view( - targets, recursive=recursive, with_deps=with_deps, onerror=onerror + targets, + recursive=recursive, + with_deps=with_deps, + onerror=onerror, + outs_filter=outs_filter, ) with ui.progress(unit="entry", desc="Building workspace index", leave=True) as pb: diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 3eafb01a2d..5e99f7e596 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -55,6 +55,8 @@ def stage_filter(stage: "Stage") -> bool: def outs_filter(out: "Output") -> bool: if push and not out.can_push: return False + if out.explicit and not any(t for t in targets): + return False return not (remote and out.remote and remote != out.remote) for rev in repo.brancher( diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 1781aecda2..9ea5277bbe 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -11,7 +11,7 @@ from dvc.exceptions import CheckoutError, CheckoutErrorSuggestGit, NoOutputOrStageError from dvc.fs import system from dvc.stage.exceptions import StageFileDoesNotExistError -from dvc.utils import relpath +from dvc.utils import relpath, serialize from dvc.utils.fs import remove from tests.utils import get_gitignore_content @@ -744,3 +744,21 @@ def test_checkout_dir_compat(tmp_dir, dvc): remove("data") dvc.checkout() assert (tmp_dir / "data").read_text() == {"foo": "foo"} + + +def test_checkout_explicit(tmp_dir, dvc, copy_script): + tmp_dir.dvc_gen({"explicit": "x", "not_explicit": "y"}) + explicit = serialize.load_yaml(tmp_dir / "explicit.dvc") + assert explicit["outs"][0]["path"] == "explicit" + explicit["outs"][0]["explicit"] = True + serialize.dump_yaml(tmp_dir / "explicit.dvc", explicit) + + remove(tmp_dir / "explicit") + remove(tmp_dir / "not_explicit") + + dvc.checkout(force=True) + assert not (tmp_dir / "explicit").exists() + assert (tmp_dir / "not_explicit").read_text() == "y" + + dvc.checkout(targets="explicit.dvc", force=True) + assert (tmp_dir / "explicit").read_text() == "x" From 690259b03705e3d0d83286e410e837164e6e42cb Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Fri, 14 Jun 2024 20:03:51 +0000 Subject: [PATCH 02/11] simplify filter --- dvc/repo/checkout.py | 2 +- dvc/repo/fetch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 78bc2dacca..a36d294855 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -129,7 +129,7 @@ def onerror(target, exc): raise def outs_filter(out: "Output") -> bool: - return not out.explicit or any(t for t in targets) + return not out.explicit or any(targets) view = self.index.targets_view( targets, diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 5e99f7e596..525b003442 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -55,7 +55,7 @@ def stage_filter(stage: "Stage") -> bool: def outs_filter(out: "Output") -> bool: if push and not out.can_push: return False - if out.explicit and not any(t for t in targets): + if out.explicit and not any(targets): return False return not (remote and out.remote and remote != out.remote) From 3238f2f2f2218ffe8b6040243ff0a7a6fd1259ce Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Mon, 24 Jun 2024 17:35:56 +0000 Subject: [PATCH 03/11] update unit test to cover multiple explicit outs --- tests/func/test_checkout.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 9ea5277bbe..faf62102d6 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -747,18 +747,23 @@ def test_checkout_dir_compat(tmp_dir, dvc): def test_checkout_explicit(tmp_dir, dvc, copy_script): - tmp_dir.dvc_gen({"explicit": "x", "not_explicit": "y"}) - explicit = serialize.load_yaml(tmp_dir / "explicit.dvc") - assert explicit["outs"][0]["path"] == "explicit" - explicit["outs"][0]["explicit"] = True - serialize.dump_yaml(tmp_dir / "explicit.dvc", explicit) - - remove(tmp_dir / "explicit") + tmp_dir.dvc_gen({"explicit1": "x", "explicit2": "y", "not_explicit": "z"}) + for name in ["explicit1", "explicit2"]: + path = tmp_dir / f"{name}.dvc" + explicit = serialize.load_yaml(path) + assert explicit["outs"][0]["path"] == name + explicit["outs"][0]["explicit"] = True + serialize.dump_yaml(path, explicit) + + remove(tmp_dir / "explicit1") + remove(tmp_dir / "explicit2") remove(tmp_dir / "not_explicit") dvc.checkout(force=True) - assert not (tmp_dir / "explicit").exists() - assert (tmp_dir / "not_explicit").read_text() == "y" + assert not (tmp_dir / "explicit1").exists() + assert not (tmp_dir / "explicit2").exists() + assert (tmp_dir / "not_explicit").read_text() == "z" - dvc.checkout(targets="explicit.dvc", force=True) - assert (tmp_dir / "explicit").read_text() == "x" + dvc.checkout(targets="explicit1.dvc", force=True) + assert (tmp_dir / "explicit1").read_text() == "x" + assert not (tmp_dir / "explicit2").exists() From d10a3d042068fa4b2ccbf7d0f39c7000339f1e9b Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Tue, 25 Jun 2024 20:33:38 +0000 Subject: [PATCH 04/11] switch option name to "pull" --- dvc/output.py | 13 +++++++------ dvc/repo/checkout.py | 4 +++- dvc/repo/fetch.py | 2 +- tests/func/test_checkout.py | 38 +++++++++++++++++++++---------------- 4 files changed, 33 insertions(+), 24 deletions(-) diff --git a/dvc/output.py b/dvc/output.py index ebdbac1b09..18e3a5defa 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -11,6 +11,7 @@ from funcy import collecting, first, project from dvc import prompt +from dvc.config_schema import Choices from dvc.exceptions import ( CacheLinkError, CheckoutError, @@ -88,13 +89,13 @@ def loadd_from(stage, d_list): for d in d_list: p = d.pop(Output.PARAM_PATH) cache = d.pop(Output.PARAM_CACHE, True) - explicit = d.pop(Output.PARAM_EXPLICIT, False) metric = d.pop(Output.PARAM_METRIC, False) plot = d.pop(Output.PARAM_PLOT, False) persist = d.pop(Output.PARAM_PERSIST, False) remote = d.pop(Output.PARAM_REMOTE, None) annot = {field: d.pop(field, None) for field in ANNOTATION_FIELDS} files = d.pop(Output.PARAM_FILES, None) + pull = d.pop(Output.PARAM_PULL, "always") push = d.pop(Output.PARAM_PUSH, True) hash_name = d.pop(Output.PARAM_HASH, None) fs_config = d.pop(Output.PARAM_FS_CONFIG, None) @@ -104,13 +105,13 @@ def loadd_from(stage, d_list): p, info=d, cache=cache, - explicit=explicit, metric=metric, plot=plot, persist=persist, remote=remote, **annot, files=files, + pull=pull, push=push, hash_name=hash_name, fs_config=fs_config, @@ -284,7 +285,6 @@ class Output: PARAM_PATH = "path" PARAM_CACHE = "cache" - PARAM_EXPLICIT = "explicit" PARAM_FILES = "files" PARAM_METRIC = "metric" PARAM_METRIC_TYPE = "type" @@ -299,6 +299,7 @@ class Output: PARAM_PLOT_HEADER = "header" PARAM_PERSIST = "persist" PARAM_REMOTE = "remote" + PARAM_PULL = "pull" PARAM_PUSH = "push" PARAM_CLOUD = "cloud" PARAM_HASH = "hash" @@ -315,7 +316,6 @@ def __init__( # noqa: PLR0913 path, info=None, cache=True, - explicit=False, metric=False, plot=False, persist=False, @@ -327,6 +327,7 @@ def __init__( # noqa: PLR0913 repo=None, fs_config=None, files: Optional[list[dict[str, Any]]] = None, + pull: str = "always", push: bool = True, hash_name: Optional[str] = DEFAULT_ALGORITHM, ): @@ -388,10 +389,10 @@ def __init__( # noqa: PLR0913 files = [merge_file_meta_from_cloud(f) for f in files] self.files = files self.use_cache = False if self.IS_DEPENDENCY else cache - self.explicit = explicit self.metric = False if self.IS_DEPENDENCY else metric self.plot = False if self.IS_DEPENDENCY else plot self.persist = persist + self.pull = pull self.can_push = push self.fs_path = self._parse_path(self.fs, fs_path) @@ -1505,8 +1506,8 @@ def _merge_dir_version_meta(self, other: "Output"): **ARTIFACT_SCHEMA, **ANNOTATION_SCHEMA, Output.PARAM_CACHE: bool, - Output.PARAM_EXPLICIT: bool, Output.PARAM_REMOTE: str, + Output.PARAM_PULL: vol.All(vol.Lower, Choices("always", "never", "explicit")), Output.PARAM_PUSH: bool, Output.PARAM_FILES: [DIR_FILES_SCHEMA], Output.PARAM_FS_CONFIG: dict, diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index a36d294855..056fc02f39 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -129,7 +129,9 @@ def onerror(target, exc): raise def outs_filter(out: "Output") -> bool: - return not out.explicit or any(targets) + if out.pull == "never": + return False + return out.pull != "explicit" or any(targets) view = self.index.targets_view( targets, diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 525b003442..d0dfec4d3e 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -55,7 +55,7 @@ def stage_filter(stage: "Stage") -> bool: def outs_filter(out: "Output") -> bool: if push and not out.can_push: return False - if out.explicit and not any(targets): + if out.pull == "never" or out.pull == "explicit" and not any(targets): return False return not (remote and out.remote and remote != out.remote) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index faf62102d6..e7fba4b117 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -746,24 +746,30 @@ def test_checkout_dir_compat(tmp_dir, dvc): assert (tmp_dir / "data").read_text() == {"foo": "foo"} -def test_checkout_explicit(tmp_dir, dvc, copy_script): - tmp_dir.dvc_gen({"explicit1": "x", "explicit2": "y", "not_explicit": "z"}) - for name in ["explicit1", "explicit2"]: +def test_checkout_pull_option(tmp_dir, dvc, copy_script): + tmp_dir.dvc_gen({"never": "w", "explicit_1": "x", "explicit_2": "y", "always": "z"}) + for name in ["never", "explicit_1", "explicit_2"]: path = tmp_dir / f"{name}.dvc" - explicit = serialize.load_yaml(path) - assert explicit["outs"][0]["path"] == name - explicit["outs"][0]["explicit"] = True - serialize.dump_yaml(path, explicit) + conf = serialize.load_yaml(path) + assert conf["outs"][0]["path"] == name + pull_type = name.split("_")[0] + conf["outs"][0]["pull"] = pull_type + serialize.dump_yaml(path, conf) - remove(tmp_dir / "explicit1") - remove(tmp_dir / "explicit2") - remove(tmp_dir / "not_explicit") + remove(tmp_dir / "never") + remove(tmp_dir / "explicit_1") + remove(tmp_dir / "explicit_2") + remove(tmp_dir / "always") dvc.checkout(force=True) - assert not (tmp_dir / "explicit1").exists() - assert not (tmp_dir / "explicit2").exists() - assert (tmp_dir / "not_explicit").read_text() == "z" + assert not (tmp_dir / "never").exists() + assert not (tmp_dir / "explicit_1").exists() + assert not (tmp_dir / "explicit_2").exists() + assert (tmp_dir / "always").read_text() == "z" - dvc.checkout(targets="explicit1.dvc", force=True) - assert (tmp_dir / "explicit1").read_text() == "x" - assert not (tmp_dir / "explicit2").exists() + dvc.checkout(targets="explicit_1.dvc", force=True) + assert (tmp_dir / "explicit_1").read_text() == "x" + assert not (tmp_dir / "explicit_2").exists() + + dvc.checkout(targets="never.dvc", force=True) + assert not (tmp_dir / "never").exists() From 473cf912c447dafb5b816e2ec6031cae25a25163 Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Tue, 25 Jun 2024 21:53:54 +0000 Subject: [PATCH 05/11] simplify pull option --- dvc/output.py | 7 +++---- dvc/repo/checkout.py | 4 +--- dvc/repo/fetch.py | 2 +- tests/func/test_checkout.py | 12 +++--------- 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/dvc/output.py b/dvc/output.py index 18e3a5defa..2cb3818cf7 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -11,7 +11,6 @@ from funcy import collecting, first, project from dvc import prompt -from dvc.config_schema import Choices from dvc.exceptions import ( CacheLinkError, CheckoutError, @@ -95,7 +94,7 @@ def loadd_from(stage, d_list): remote = d.pop(Output.PARAM_REMOTE, None) annot = {field: d.pop(field, None) for field in ANNOTATION_FIELDS} files = d.pop(Output.PARAM_FILES, None) - pull = d.pop(Output.PARAM_PULL, "always") + pull = d.pop(Output.PARAM_PULL, True) push = d.pop(Output.PARAM_PUSH, True) hash_name = d.pop(Output.PARAM_HASH, None) fs_config = d.pop(Output.PARAM_FS_CONFIG, None) @@ -327,7 +326,7 @@ def __init__( # noqa: PLR0913 repo=None, fs_config=None, files: Optional[list[dict[str, Any]]] = None, - pull: str = "always", + pull: bool = True, push: bool = True, hash_name: Optional[str] = DEFAULT_ALGORITHM, ): @@ -1507,7 +1506,7 @@ def _merge_dir_version_meta(self, other: "Output"): **ANNOTATION_SCHEMA, Output.PARAM_CACHE: bool, Output.PARAM_REMOTE: str, - Output.PARAM_PULL: vol.All(vol.Lower, Choices("always", "never", "explicit")), + Output.PARAM_PULL: bool, Output.PARAM_PUSH: bool, Output.PARAM_FILES: [DIR_FILES_SCHEMA], Output.PARAM_FS_CONFIG: dict, diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 056fc02f39..7df12ff522 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -129,9 +129,7 @@ def onerror(target, exc): raise def outs_filter(out: "Output") -> bool: - if out.pull == "never": - return False - return out.pull != "explicit" or any(targets) + return out.pull or any(targets) view = self.index.targets_view( targets, diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index d0dfec4d3e..16efdd4021 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -55,7 +55,7 @@ def stage_filter(stage: "Stage") -> bool: def outs_filter(out: "Output") -> bool: if push and not out.can_push: return False - if out.pull == "never" or out.pull == "explicit" and not any(targets): + if not out.pull and not any(targets): return False return not (remote and out.remote and remote != out.remote) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index e7fba4b117..86c97dddd9 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -747,22 +747,19 @@ def test_checkout_dir_compat(tmp_dir, dvc): def test_checkout_pull_option(tmp_dir, dvc, copy_script): - tmp_dir.dvc_gen({"never": "w", "explicit_1": "x", "explicit_2": "y", "always": "z"}) - for name in ["never", "explicit_1", "explicit_2"]: + tmp_dir.dvc_gen({"explicit_1": "x", "explicit_2": "y", "always": "z"}) + for name in ["explicit_1", "explicit_2"]: path = tmp_dir / f"{name}.dvc" conf = serialize.load_yaml(path) assert conf["outs"][0]["path"] == name - pull_type = name.split("_")[0] - conf["outs"][0]["pull"] = pull_type + conf["outs"][0]["pull"] = False serialize.dump_yaml(path, conf) - remove(tmp_dir / "never") remove(tmp_dir / "explicit_1") remove(tmp_dir / "explicit_2") remove(tmp_dir / "always") dvc.checkout(force=True) - assert not (tmp_dir / "never").exists() assert not (tmp_dir / "explicit_1").exists() assert not (tmp_dir / "explicit_2").exists() assert (tmp_dir / "always").read_text() == "z" @@ -770,6 +767,3 @@ def test_checkout_pull_option(tmp_dir, dvc, copy_script): dvc.checkout(targets="explicit_1.dvc", force=True) assert (tmp_dir / "explicit_1").read_text() == "x" assert not (tmp_dir / "explicit_2").exists() - - dvc.checkout(targets="never.dvc", force=True) - assert not (tmp_dir / "never").exists() From 8d4e10326894b6fef7a1e2cc6c5c50423ec6beba Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Thu, 27 Jun 2024 15:10:15 +0000 Subject: [PATCH 06/11] handle targets==None --- dvc/repo/checkout.py | 2 +- dvc/repo/fetch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 7df12ff522..47f47c8e95 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -129,7 +129,7 @@ def onerror(target, exc): raise def outs_filter(out: "Output") -> bool: - return out.pull or any(targets) + return out.pull or (targets and any(targets)) view = self.index.targets_view( targets, diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 16efdd4021..b2c7a8a642 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -55,7 +55,7 @@ def stage_filter(stage: "Stage") -> bool: def outs_filter(out: "Output") -> bool: if push and not out.can_push: return False - if not out.pull and not any(targets): + if not out.pull and not (targets and any(targets)): return False return not (remote and out.remote and remote != out.remote) From 98a9d1880031c0c54272bda948b83cd444b8b22b Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Fri, 28 Jun 2024 14:37:27 +0000 Subject: [PATCH 07/11] handle reuse of `_collect_indexes` in pull context --- dvc/repo/fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index b2c7a8a642..1b508739a2 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -55,7 +55,7 @@ def stage_filter(stage: "Stage") -> bool: def outs_filter(out: "Output") -> bool: if push and not out.can_push: return False - if not out.pull and not (targets and any(targets)): + if not push and not out.pull and not (targets and any(targets)): return False return not (remote and out.remote and remote != out.remote) From 9a0bc2370162a0e813e84c1c79b67d365591a3a0 Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Wed, 3 Jul 2024 14:32:59 +0000 Subject: [PATCH 08/11] add fetch test and simplify --- tests/func/test_checkout.py | 9 +++------ tests/func/test_data_cloud.py | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 86c97dddd9..a0598881a1 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -11,7 +11,7 @@ from dvc.exceptions import CheckoutError, CheckoutErrorSuggestGit, NoOutputOrStageError from dvc.fs import system from dvc.stage.exceptions import StageFileDoesNotExistError -from dvc.utils import relpath, serialize +from dvc.utils import relpath from dvc.utils.fs import remove from tests.utils import get_gitignore_content @@ -749,11 +749,8 @@ def test_checkout_dir_compat(tmp_dir, dvc): def test_checkout_pull_option(tmp_dir, dvc, copy_script): tmp_dir.dvc_gen({"explicit_1": "x", "explicit_2": "y", "always": "z"}) for name in ["explicit_1", "explicit_2"]: - path = tmp_dir / f"{name}.dvc" - conf = serialize.load_yaml(path) - assert conf["outs"][0]["path"] == name - conf["outs"][0]["pull"] = False - serialize.dump_yaml(path, conf) + with (tmp_dir / f"{name}.dvc").modify() as d: + d["outs"][0]["pull"] = False remove(tmp_dir / "explicit_1") remove(tmp_dir / "explicit_2") diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index a506a205ce..118624fc7b 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -678,3 +678,30 @@ def test_pull_granular_excluding_import_that_cannot_be_pulled( dvc.pull() with pytest.raises(CloneError, match="SCM error"): dvc.pull(imp_stage.addressing) + + +def test_fetch_pull_option(tmp_dir, dvc, local_remote): + file_config = {"explicit_1": "x", "explicit_2": "y", "always": "z"} + tmp_dir.dvc_gen(file_config) + for name in ["explicit_1", "explicit_2"]: + with (tmp_dir / f"{name}.dvc").modify() as d: + d["outs"][0]["pull"] = False + dvc.push() + oids = { + name: (tmp_dir / f"{name}.dvc").parse()["outs"][0]["md5"] + for name in file_config + } + expected = list(oids.values()) + assert dvc.cache.local.oids_exist(expected) == expected + + # purge cache + dvc.cache.local.clear() + assert dvc.cache.local.oids_exist(oids.values()) == [] + + dvc.fetch() + expected = [oids[name] for name in ["always"]] + assert dvc.cache.local.oids_exist(expected) == expected + + dvc.fetch("explicit_1") + expected = [oids[name] for name in ["always", "explicit_1"]] + assert dvc.cache.local.oids_exist(expected) == expected From 15033f9d3426c8465c1162b710397209280cfeed Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Wed, 3 Jul 2024 14:40:37 +0000 Subject: [PATCH 09/11] fix test --- tests/func/test_data_cloud.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 118624fc7b..80699e2928 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -687,21 +687,21 @@ def test_fetch_pull_option(tmp_dir, dvc, local_remote): with (tmp_dir / f"{name}.dvc").modify() as d: d["outs"][0]["pull"] = False dvc.push() - oids = { + oids_by_name = { name: (tmp_dir / f"{name}.dvc").parse()["outs"][0]["md5"] for name in file_config } - expected = list(oids.values()) - assert dvc.cache.local.oids_exist(expected) == expected + all_oids = set(oids_by_name.values()) + assert set(dvc.cache.local.oids_exist(all_oids)) == all_oids # purge cache dvc.cache.local.clear() - assert dvc.cache.local.oids_exist(oids.values()) == [] + assert set(dvc.cache.local.oids_exist(all_oids)) == set() dvc.fetch() - expected = [oids[name] for name in ["always"]] - assert dvc.cache.local.oids_exist(expected) == expected + expected = {oids_by_name[name] for name in ["always"]} + assert set(dvc.cache.local.oids_exist(all_oids)) == expected dvc.fetch("explicit_1") - expected = [oids[name] for name in ["always", "explicit_1"]] - assert dvc.cache.local.oids_exist(expected) == expected + expected = {oids_by_name[name] for name in ["always", "explicit_1"]} + assert set(dvc.cache.local.oids_exist(all_oids)) == expected From 0935bbe4f32e99bff96d0cd40e9eba48a731fed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Thu, 4 Jul 2024 08:45:45 +0545 Subject: [PATCH 10/11] add support for stages --- dvc/output.py | 4 ++ dvc/schema.py | 1 + dvc/stage/serialize.py | 3 ++ tests/func/test_checkout.py | 10 ++-- tests/func/test_data_cloud.py | 88 +++++++++++++++++++++++++++-------- 5 files changed, 81 insertions(+), 25 deletions(-) diff --git a/dvc/output.py b/dvc/output.py index 2cb3818cf7..28c2d46a4d 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -191,6 +191,7 @@ def load_from_pipeline(stage, data, typ="outs"): Output.PARAM_PERSIST, Output.PARAM_REMOTE, Output.PARAM_PUSH, + Output.PARAM_PULL, *ANNOTATION_FIELDS, ], ) @@ -875,6 +876,9 @@ def dumpd(self, **kwargs): # noqa: C901, PLR0912 if not self.can_push: ret[self.PARAM_PUSH] = self.can_push + if not self.pull: + ret[self.PARAM_PULL] = self.pull + if with_files: obj = self.obj or self.get_obj() if obj: diff --git a/dvc/schema.py b/dvc/schema.py index 85001016fc..bad6eb5049 100644 --- a/dvc/schema.py +++ b/dvc/schema.py @@ -59,6 +59,7 @@ "checkpoint": bool, Output.PARAM_REMOTE: str, Output.PARAM_PUSH: bool, + Output.PARAM_PULL: bool, } } diff --git a/dvc/stage/serialize.py b/dvc/stage/serialize.py index fef6514b71..d780b480ca 100644 --- a/dvc/stage/serialize.py +++ b/dvc/stage/serialize.py @@ -29,6 +29,7 @@ PARAM_DESC = Annotation.PARAM_DESC PARAM_REMOTE = Output.PARAM_REMOTE PARAM_PUSH = Output.PARAM_PUSH +PARAM_PULL = Output.PARAM_PULL DEFAULT_PARAMS_FILE = ParamsDependency.DEFAULT_PARAMS_FILE @@ -51,6 +52,8 @@ def _get_flags(out): yield PARAM_REMOTE, out.remote if not out.can_push: yield PARAM_PUSH, False + if not out.pull: + yield PARAM_PULL, False def _serialize_out(out): diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index a0598881a1..42ccb35929 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -746,11 +746,11 @@ def test_checkout_dir_compat(tmp_dir, dvc): assert (tmp_dir / "data").read_text() == {"foo": "foo"} -def test_checkout_pull_option(tmp_dir, dvc, copy_script): - tmp_dir.dvc_gen({"explicit_1": "x", "explicit_2": "y", "always": "z"}) - for name in ["explicit_1", "explicit_2"]: - with (tmp_dir / f"{name}.dvc").modify() as d: - d["outs"][0]["pull"] = False +def test_checkout_for_files_with_explicit_pull_option_set(tmp_dir, dvc, copy_script): + stages = tmp_dir.dvc_gen({"explicit_1": "x", "explicit_2": "y", "always": "z"}) + for stage in stages[:-1]: + stage.outs[0].pull = False + stage.dump() remove(tmp_dir / "explicit_1") remove(tmp_dir / "explicit_2") diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 80699e2928..55ac6debd9 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -680,28 +680,76 @@ def test_pull_granular_excluding_import_that_cannot_be_pulled( dvc.pull(imp_stage.addressing) -def test_fetch_pull_option(tmp_dir, dvc, local_remote): - file_config = {"explicit_1": "x", "explicit_2": "y", "always": "z"} - tmp_dir.dvc_gen(file_config) - for name in ["explicit_1", "explicit_2"]: - with (tmp_dir / f"{name}.dvc").modify() as d: - d["outs"][0]["pull"] = False +def test_fetch_for_files_with_explicit_pull_set(tmp_dir, dvc, local_remote): + stages = tmp_dir.dvc_gen({"explicit_1": "x", "explicit_2": "y", "always": "z"}) + for stage in stages[:-1]: + stage.outs[0].pull = False + stage.dump() + + dvc.push() + dvc.cache.local.clear() # purge cache + + oids = {stage.outs[0].def_path: stage.outs[0].hash_info.value for stage in stages} + + assert dvc.fetch() == 1 + assert set(dvc.cache.local.all()) == {oids["always"]} + + assert dvc.fetch("explicit_1") == 1 + assert set(dvc.cache.local.all()) == {oids["always"], oids["explicit_1"]} + + +def test_pull_for_files_with_explicit_pull_set(tmp_dir, dvc, local_remote): + stages = tmp_dir.dvc_gen({"explicit_1": "x", "explicit_2": "y", "always": "z"}) + for stage in stages[:-1]: + stage.outs[0].pull = False + stage.dump() + dvc.push() - oids_by_name = { - name: (tmp_dir / f"{name}.dvc").parse()["outs"][0]["md5"] - for name in file_config + dvc.cache.local.clear() # purge cache + remove("explicit_1") + remove("explicit_2") + remove("always") + + assert dvc.pull() == { + "added": ["always"], + "deleted": [], + "fetched": 1, + "modified": [], } - all_oids = set(oids_by_name.values()) - assert set(dvc.cache.local.oids_exist(all_oids)) == all_oids - # purge cache - dvc.cache.local.clear() - assert set(dvc.cache.local.oids_exist(all_oids)) == set() + assert dvc.pull("explicit_1") == { + "added": ["explicit_1"], + "deleted": [], + "fetched": 1, + "modified": [], + } - dvc.fetch() - expected = {oids_by_name[name] for name in ["always"]} - assert set(dvc.cache.local.oids_exist(all_oids)) == expected - dvc.fetch("explicit_1") - expected = {oids_by_name[name] for name in ["always", "explicit_1"]} - assert set(dvc.cache.local.oids_exist(all_oids)) == expected +def test_pull_for_stage_outputs_with_explicit_pull_set(tmp_dir, dvc, local_remote): + stage1 = dvc.stage.add(name="always", outs=["always"], cmd="echo always > always") + stage2 = dvc.stage.add( + name="explicit", outs=["explicit"], cmd="echo explicit > explicit" + ) + stage2.outs[0].pull = False + stage2.dump() + + assert set(dvc.reproduce()) == {stage1, stage2} + dvc.push() + + dvc.cache.local.clear() # purge cache + remove("explicit") + remove("always") + + assert dvc.pull() == { + "added": ["always"], + "deleted": [], + "fetched": 1, + "modified": [], + } + + assert dvc.pull("explicit") == { + "added": ["explicit"], + "deleted": [], + "fetched": 1, + "modified": [], + } From 514b15efac3d52ffb0241534dd5baefd7384fd4f Mon Sep 17 00:00:00 2001 From: Philip Bailey Date: Fri, 12 Jul 2024 20:30:34 +0000 Subject: [PATCH 11/11] remove pull option from checkout but allow missing for pull=False --- dvc/repo/checkout.py | 26 +++++++++++++------------- tests/func/test_checkout.py | 14 ++++++++++---- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 47f47c8e95..2948a06055 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -14,7 +14,6 @@ from . import locked if TYPE_CHECKING: - from dvc.output import Output from dvc_data.index import BaseDataIndex, DataIndexEntry from dvc_objects.fs.base import FileSystem @@ -128,15 +127,11 @@ def onerror(target, exc): raise CheckoutErrorSuggestGit(target) from exc raise - def outs_filter(out: "Output") -> bool: - return out.pull or (targets and any(targets)) - view = self.index.targets_view( targets, recursive=recursive, with_deps=with_deps, onerror=onerror, - outs_filter=outs_filter, ) with ui.progress(unit="entry", desc="Building workspace index", leave=True) as pb: @@ -153,16 +148,16 @@ def outs_filter(out: "Output") -> bool: _check_can_delete(diff.files_delete, new, self.root_dir, self.fs) failed = set() - out_paths = [out.fs_path for out in view.outs if out.use_cache and out.is_in_repo] + outs = [out for out in view.outs if out.use_cache and out.is_in_repo] def checkout_onerror(src_path, dest_path, _exc): logger.debug( "failed to create '%s' from '%s'", dest_path, src_path, exc_info=True ) - for out_path in out_paths: - if self.fs.isin_or_eq(dest_path, out_path): - failed.add(out_path) + for out in outs: + if self.fs.isin_or_eq(dest_path, out.fs_path): + failed.add(out) with ui.progress(unit="file", desc="Applying changes", leave=True) as pb: apply( @@ -179,16 +174,21 @@ def checkout_onerror(src_path, dest_path, _exc): out_changes = _build_out_changes(view, diff.changes) typ_map = {ADD: "added", DELETE: "deleted", MODIFY: "modified"} + failed_paths = {out.fs_path for out in failed} for key, typ in out_changes.items(): out_path = self.fs.join(self.root_dir, *key) - if out_path in failed: + if out_path in failed_paths: self.fs.remove(out_path, recursive=True) else: self.state.save_link(out_path, self.fs) stats[typ_map[typ]].append(_fspath_dir(out_path)) - if failed and not allow_missing: - raise CheckoutError([relpath(out_path) for out_path in failed], stats) - + unexpected_failure = [ + out.fs_path for out in failed if out.pull or (targets and any(targets)) + ] + if unexpected_failure and not allow_missing: + raise CheckoutError( + [relpath(out_path) for out_path in unexpected_failure], stats + ) return stats diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 42ccb35929..e53558ff90 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -756,11 +756,17 @@ def test_checkout_for_files_with_explicit_pull_option_set(tmp_dir, dvc, copy_scr remove(tmp_dir / "explicit_2") remove(tmp_dir / "always") + # ensure missing pull=False file does not cause an error + explicit2_oid = (tmp_dir / "explicit_2.dvc").parse()["outs"][0]["md5"] + dvc.cache.local.delete(explicit2_oid) + dvc.checkout(force=True) - assert not (tmp_dir / "explicit_1").exists() + # pull=False, but present in cache + assert (tmp_dir / "explicit_1").read_text() == "x" + # pull=False, not in cache assert not (tmp_dir / "explicit_2").exists() + # pull=True assert (tmp_dir / "always").read_text() == "z" - dvc.checkout(targets="explicit_1.dvc", force=True) - assert (tmp_dir / "explicit_1").read_text() == "x" - assert not (tmp_dir / "explicit_2").exists() + with pytest.raises(CheckoutError): + dvc.checkout(targets="explicit_2.dvc", force=True)