Skip to content

Commit c92f346

Browse files
committed
refactoring and add a function to verify cached repos
1 parent d66d0b3 commit c92f346

File tree

7 files changed

+555
-180
lines changed

7 files changed

+555
-180
lines changed

docs/source/en/guides/cli.md

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -671,23 +671,42 @@ As with the other cache commands, `--dry-run`, `--yes`, and `--cache-dir` are av
671671

672672
## hf cache verify
673673

674-
Use `hf cache verify` to validate cached files against their checksums on the Hub. You can pass one or more targets:
674+
Use `hf cache verify` to validate local files against their checksums on the Hub. Target a single repo per invocation and choose between verifying the cache snapshot and verifying a regular local directory.
675675

676-
- Repository ID like `model/sentence-transformers/all-MiniLM-L6-v2` to verify all cached revisions for that repo.
677-
- Specific revision hashes.
676+
Examples:
678677

679678
```bash
680-
>>> hf cache verify model/sentence-transformers/all-MiniLM-L6-v2 11c5a3d5811f50298f278a704980280950aedb10
681-
✅ Verified 60 file(s) across 2 revision(s); no checksum mismatches detected.
679+
# Verify main revision of a model in cache
680+
>>> hf cache verify deepseek-ai/DeepSeek-OCR
681+
682+
# Verify a specific revision
683+
>>> hf cache verify deepseek-ai/DeepSeek-OCR --revision refs/pr/1
684+
>>> hf cache verify deepseek-ai/DeepSeek-OCR --revision abcdef123
685+
686+
# Verify a private repo
687+
>>> hf cache verify me/private-model --token hf_***
688+
689+
# Verify a dataset
690+
>>> hf cache verify karpathy/fineweb-edu-100b-shuffle --repo-type dataset
691+
692+
# Verify files in a local directory
693+
>>> hf cache verify deepseek-ai/DeepSeek-OCR --local-dir /path/to/repo
682694
```
683695

684-
If mismatches are detected, the command prints a detailed list of issues (missing locally, not present on the Hub for this revision, or checksum mismatches) and exits with a non-zero status:
696+
By default, the command warns about missing or extra files but does not fail. Use flags to make these conditions fail the command:
697+
698+
```bash
699+
>>> hf cache verify gpt2 --fail-on-missing-files --fail-on-extra-files
700+
```
701+
702+
On success, you will see a summary:
685703

686704
```text
687-
❌ Checksum verification failed for the following file(s):
688-
- dataset/nyu-mll/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c::cola/test-00000-of-00001.parquet: missing locally.
705+
✅ Verified 60 file(s) at e7da7f221d5bf496a48136c0cd264e630fe9fcc8; no checksum mismatches.
689706
```
690707

708+
If mismatches are detected, the command prints a detailed list and exits with a non-zero status.
709+
691710
## hf repo tag create
692711

693712
The `hf repo tag create` command allows you to tag, untag, and list tags for repositories.

src/huggingface_hub/cli/cache.py

Lines changed: 79 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
)
4141
from ..utils._parsing import parse_duration, parse_size
4242
from ..utils.sha import git_hash, sha_fileobj
43-
from ._cli_utils import TokenOpt, get_hf_api, typer_factory
43+
from ._cli_utils import RepoIdArg, RepoTypeOpt, RevisionOpt, TokenOpt, get_hf_api, typer_factory
4444

4545

4646
cache_cli = typer_factory(help="Manage local cache directory.")
@@ -749,65 +749,97 @@ def prune(
749749

750750
@cache_cli.command()
751751
def verify(
752-
targets: Annotated[
753-
list[str],
754-
typer.Argument(
755-
help="One or more repo IDs (e.g. model/bert-base-uncased) or revision hashes to verify.",
756-
),
757-
],
752+
repo_id: RepoIdArg,
753+
repo_type: RepoTypeOpt = RepoTypeOpt.model,
754+
revision: RevisionOpt = None,
758755
cache_dir: Annotated[
759756
Optional[str],
760757
typer.Option(
761-
help="Cache directory to scan (defaults to Hugging Face cache).",
758+
help="Cache directory to use when verifying files from cache (defaults to Hugging Face cache).",
762759
),
763760
] = None,
761+
local_dir: Annotated[
762+
Optional[str],
763+
typer.Option(
764+
help="If set, verify files under this directory instead of the cache.",
765+
),
766+
] = None,
767+
fail_on_missing_files: Annotated[
768+
bool,
769+
typer.Option(
770+
help="Fail if some files exist on the remote but are missing locally.",
771+
),
772+
] = False,
773+
fail_on_extra_files: Annotated[
774+
bool,
775+
typer.Option(
776+
help="Fail if some files exist locally but are not present on the remote revision.",
777+
),
778+
] = False,
764779
token: TokenOpt = None,
765780
) -> None:
766-
"""Verify cached repos or revisions by comparing against Hub metadata."""
781+
"""Verify checksums for a single repo revision from cache or a local directory.
767782
768-
try:
769-
hf_cache_info = scan_cache_dir(cache_dir)
770-
except CacheNotFound as exc:
771-
print(f"Cache directory not found: {str(exc.cache_dir)}")
772-
raise typer.Exit(code=1)
783+
Examples:
784+
- Verify main revision in cache: `hf cache verify gpt2`
785+
- Verify specific revision: `hf cache verify gpt2 --revision refs/pr/1`
786+
- Verify dataset: `hf cache verify karpathy/fineweb-edu-100b-shuffle --repo-type dataset`
787+
- Verify local dir: `hf cache verify deepseek-ai/DeepSeek-OCR --local-dir /path/to/repo`
788+
"""
773789

774-
resolution = _resolve_targets(hf_cache_info, targets)
775-
if resolution.missing:
776-
print("Could not find the following targets in the cache:")
777-
for entry in resolution.missing:
778-
print(f" - {entry}")
790+
if local_dir is not None and cache_dir is not None:
791+
print("Cannot pass both --local-dir and --cache-dir. Use one or the other.")
792+
raise typer.Exit(code=2)
779793

780-
if len(resolution.revisions) == 0:
781-
print("Nothing to verify.")
782-
raise typer.Exit(code=0)
794+
api = get_hf_api(token=token)
783795

784-
checked_blobs: set[Path] = set()
785-
git_hash_cache: dict[Path, str] = {}
786-
issues: list[str] = []
796+
try:
797+
result = api.verify_repo_checksums(
798+
repo_id=repo_id,
799+
repo_type=repo_type.value if hasattr(repo_type, "value") else str(repo_type),
800+
revision=revision,
801+
local_dir=local_dir,
802+
cache_dir=cache_dir,
803+
token=token,
804+
)
805+
except ValueError as exc:
806+
print(str(exc))
807+
raise typer.Exit(code=1)
787808

788-
for repo in sorted(resolution.selected.keys(), key=lambda item: (item.repo_type, item.repo_id.lower())):
789-
for revision in sorted(resolution.selected[repo], key=lambda rev: rev.commit_hash):
790-
revision_label = f"{repo.cache_id}@{revision.commit_hash}"
791-
revision_issues, revision_checked = _verify_revision(
792-
repo=repo,
793-
revision=revision,
794-
revision_label=revision_label,
795-
git_hash_cache=git_hash_cache,
796-
token=token,
809+
# Print mismatches first if any
810+
if result.mismatches:
811+
print("❌ Checksum verification failed for the following file(s):")
812+
for m in result.mismatches:
813+
print(f" - {m['path']}: expected {m['expected']} ({m['algorithm']}), got {m['actual']}")
814+
815+
# Handle missing/extra
816+
exit_code = 0
817+
if result.missing_paths:
818+
if fail_on_missing_files:
819+
print("Missing files (present remotely, absent locally):")
820+
for p in result.missing_paths:
821+
print(f" - {p}")
822+
exit_code = 1
823+
else:
824+
print(
825+
f"{len(result.missing_paths)} remote file(s) are missing locally. Use --fail-on-missing-files for details."
797826
)
798-
issues.extend(revision_issues)
799-
checked_blobs.update(revision_checked)
800827

801-
if not checked_blobs:
802-
print("No cached files found for the requested target.")
803-
return
828+
if result.extra_paths:
829+
if fail_on_extra_files:
830+
print("Extra files (present locally, absent remotely):")
831+
for p in result.extra_paths:
832+
print(f" - {p}")
833+
exit_code = 1
834+
else:
835+
print(
836+
f"{len(result.extra_paths)} local file(s) do not exist on remote repo. Use --fail-on-extra-files for more details."
837+
)
804838

805-
if issues:
806-
print("❌ Checksum verification failed for the following file(s):")
807-
for entry in issues:
808-
print(f" - {entry}")
809-
raise typer.Exit(code=1)
839+
if result.mismatches:
840+
exit_code = 1
810841

811-
print(
812-
f"✅ Verified {len(checked_blobs)} file(s) across {len(resolution.revisions)} revision(s); no checksum mismatches detected."
813-
)
842+
if exit_code != 0:
843+
raise typer.Exit(code=exit_code)
844+
845+
print(f"✅ Verified {result.checked_count} file(s) at {result.revision}; no checksum mismatches.")

src/huggingface_hub/hf_api.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,13 @@
105105
from .utils._auth import _get_token_from_environment, _get_token_from_file, _get_token_from_google_colab
106106
from .utils._deprecation import _deprecate_arguments
107107
from .utils._typing import CallableT
108+
from .utils._verification import collect_local_files, resolve_local_root, verify_maps
108109
from .utils.endpoint_helpers import _is_emission_within_threshold
109110

110111

111112
if TYPE_CHECKING:
112113
from .inference._providers import PROVIDER_T
114+
from .utils._verification import Verification
113115

114116
R = TypeVar("R") # Return type
115117
CollectionItemType_T = Literal["model", "dataset", "space", "paper", "collection"]
@@ -3057,6 +3059,84 @@ def list_repo_tree(
30573059
for path_info in paginate(path=tree_url, headers=headers, params={"recursive": recursive, "expand": expand}):
30583060
yield (RepoFile(**path_info) if path_info["type"] == "file" else RepoFolder(**path_info))
30593061

3062+
@validate_hf_hub_args
3063+
def verify_repo_checksums(
3064+
self,
3065+
repo_id: str,
3066+
*,
3067+
repo_type: Optional[str] = None,
3068+
revision: Optional[str] = None,
3069+
local_dir: Optional[Union[str, Path]] = None,
3070+
cache_dir: Optional[Union[str, Path]] = None,
3071+
token: Union[str, bool, None] = None,
3072+
) -> "Verification":
3073+
"""
3074+
Verify local files for a repo against Hub checksums.
3075+
3076+
Args:
3077+
repo_id (`str`):
3078+
A namespace (user or an organization) and a repo name separated by a `/`.
3079+
repo_type (`str`, *optional*):
3080+
The type of the repository from which to get the tree (`"model"`, `"dataset"` or `"space"`).
3081+
Defaults to `"model"`.
3082+
revision (`str`, *optional*):
3083+
The revision of the repository from which to get the tree. Defaults to `"main"` branch.
3084+
local_dir (`str` or `Path`, *optional*):
3085+
The local directory to verify.
3086+
cache_dir (`str` or `Path`, *optional*):
3087+
The cache directory to verify.
3088+
token (Union[bool, str, None], optional):
3089+
A valid user access token (string). Defaults to the locally saved
3090+
token, which is the recommended method for authentication (see
3091+
https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
3092+
To disable authentication, pass `False`.
3093+
3094+
Returns:
3095+
[`Verification`]: a structured result containing the verification details.
3096+
3097+
Raises:
3098+
[`~utils.RepositoryNotFoundError`]:
3099+
If repository is not found (error 404): wrong repo_id/repo_type, private but not authenticated or repo
3100+
does not exist.
3101+
[`~utils.RevisionNotFoundError`]:
3102+
If revision is not found (error 404) on the repo.
3103+
[`~utils.RemoteEntryNotFoundError`]:
3104+
If the tree (folder) does not exist (error 404) on the repo.
3105+
3106+
"""
3107+
3108+
if repo_type is None:
3109+
repo_type = constants.REPO_TYPE_MODEL
3110+
3111+
if local_dir is not None and cache_dir is not None:
3112+
raise ValueError("Pass either `local_dir` or `cache_dir`, not both.")
3113+
3114+
root, remote_revision = resolve_local_root(
3115+
repo_id=repo_id,
3116+
repo_type=repo_type,
3117+
revision=revision,
3118+
cache_dir=Path(cache_dir) if cache_dir is not None else None,
3119+
local_dir=Path(local_dir) if local_dir is not None else None,
3120+
)
3121+
local_by_path = collect_local_files(root)
3122+
3123+
# get remote entries
3124+
remote_by_path: dict[str, object] = {}
3125+
for entry in self.list_repo_tree(
3126+
repo_id=repo_id, recursive=True, revision=remote_revision, repo_type=repo_type, token=token
3127+
):
3128+
path = getattr(entry, "path", None)
3129+
if not path:
3130+
continue
3131+
lfs = getattr(entry, "lfs", None)
3132+
has_lfs_sha = (getattr(lfs, "sha256", None) is not None) or (
3133+
isinstance(lfs, dict) and lfs.get("sha256") is not None
3134+
)
3135+
if hasattr(entry, "blob_id") or has_lfs_sha:
3136+
remote_by_path[path] = entry
3137+
3138+
return verify_maps(remote_by_path=remote_by_path, local_by_path=local_by_path, revision=remote_revision)
3139+
30603140
@validate_hf_hub_args
30613141
def list_repo_refs(
30623142
self,
@@ -10666,6 +10746,7 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
1066610746
list_repo_commits = api.list_repo_commits
1066710747
list_repo_tree = api.list_repo_tree
1066810748
get_paths_info = api.get_paths_info
10749+
verify_repo_checksums = api.verify_repo_checksums
1066910750

1067010751
get_model_tags = api.get_model_tags
1067110752
get_dataset_tags = api.get_dataset_tags

0 commit comments

Comments
 (0)