From 99180e45f4ec20e77972b8b5b3a130820a0235f8 Mon Sep 17 00:00:00 2001 From: Jitka Obselkova Date: Fri, 16 May 2025 17:41:11 +0200 Subject: [PATCH] Support on-demand content in repair_metadata --- pulp_python/app/tasks/repair.py | 64 ++++++++++++- pulp_python/app/utils.py | 16 ++++ .../tests/functional/api/test_repair.py | 95 +++++++++++++++++++ 3 files changed, 170 insertions(+), 5 deletions(-) diff --git a/pulp_python/app/tasks/repair.py b/pulp_python/app/tasks/repair.py index c1fa6a71..c9a10ec0 100644 --- a/pulp_python/app/tasks/repair.py +++ b/pulp_python/app/tasks/repair.py @@ -7,7 +7,11 @@ from pulpcore.plugin.util import get_domain from pulp_python.app.models import PythonPackageContent, PythonRepository -from pulp_python.app.utils import artifact_to_python_content_data +from pulp_python.app.utils import ( + artifact_to_python_content_data, + fetch_json_release_metadata, + parse_metadata, +) log = logging.getLogger(__name__) @@ -47,8 +51,16 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int: Returns: int: The number of packages that were repaired. """ - # TODO: Add on_demand content repair - immediate_content = content.filter(contentartifact__artifact__isnull=False) + immediate_content = ( + content.filter(contentartifact__artifact__isnull=False) + .distinct() + .prefetch_related("_artifacts") + ) + on_demand_content = ( + content.filter(contentartifact__artifact__isnull=True) + .distinct() + .prefetch_related("contentartifact_set__remoteartifact_set") + ) domain = get_domain() batch = [] @@ -58,12 +70,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int: progress_report = ProgressReport( message="Repairing packages' metadata", code="repair.metadata", - total=immediate_content.count(), + total=content.count(), ) progress_report.save() with progress_report: for package in progress_report.iter( - immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000) + immediate_content.iterator(chunk_size=1000) ): new_data = artifact_to_python_content_data( package.filename, package._artifacts.get(), domain @@ -82,6 +94,48 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int: batch = [] set_of_update_fields.clear() + for package in progress_report.iter( + on_demand_content.iterator(chunk_size=1000) + ): + remote_artifacts = ( + package.contentartifact_set.get().remoteartifact_set.all() + ) + # We expect that PythonPackageContent always has correct name and version, + # and RemoteArtifact always has correct sha256 + json_data = fetch_json_release_metadata( + package.name, package.version, remote_artifacts.get().remote + ) + dist_data = next( + ( + dist + for ra in remote_artifacts + for dist in json_data["urls"] + if ra.sha256 == dist["digests"]["sha256"] + ), + None, + ) + if not dist_data: + log.warning( + _("No matching distribution for {} was found.").format(package.name) + ) + continue + + new_data = parse_metadata(json_data["info"], package.version, dist_data) + new_data.pop("url") # belongs to RemoteArtifact, not PythonPackageContent + changed = False + for field, value in new_data.items(): + if getattr(package, field) != value: + setattr(package, field, value) + set_of_update_fields.add(field) + changed = True + if changed: + batch.append(package) + if len(batch) == 1000: + total_repaired += len(batch) + PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) + batch = [] + set_of_update_fields.clear() + if batch: total_repaired += len(batch) PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) diff --git a/pulp_python/app/utils.py b/pulp_python/app/utils.py index 8baf0e9d..b4119fc4 100644 --- a/pulp_python/app/utils.py +++ b/pulp_python/app/utils.py @@ -1,5 +1,6 @@ import pkginfo import re +import requests import shutil import tempfile import json @@ -10,6 +11,8 @@ from packaging.requirements import Requirement from packaging.version import parse, InvalidVersion +from pulpcore.plugin.models import Remote + PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL" """TODO This serial constant is temporary until Python repositories implements serials""" @@ -189,6 +192,19 @@ def artifact_to_python_content_data(filename, artifact, domain=None): return data +def fetch_json_release_metadata(name: str, version: str, remote: Remote) -> dict: + """ + Fetches metadata for a specific release from PyPI's JSON API. A release can contain + multiple distributions. See https://docs.pypi.org/api/json/#get-a-release for more details. + + Returns dict containing "info", "last_serial", "urls", and "vulnerabilities" keys. + """ + url = f"{remote.url}pypi/{name}/{version}/json" + response = requests.get(url, timeout=10) + response.raise_for_status() + return response.json() + + def python_content_to_json(base_path, content_query, version=None, domain=None): """ Converts a QuerySet of PythonPackageContent into the PyPi JSON format diff --git a/pulp_python/tests/functional/api/test_repair.py b/pulp_python/tests/functional/api/test_repair.py index 4b2bce55..728ea66b 100644 --- a/pulp_python/tests/functional/api/test_repair.py +++ b/pulp_python/tests/functional/api/test_repair.py @@ -32,6 +32,49 @@ def _create(artifact_filename, filename, content_data): return _create +@pytest.fixture +def create_content_remote(python_bindings): + def _create(filename, ra_url, ra_sha256, content_data, remote): + commands = ( + "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; " + "from pulpcore.plugin.util import extract_pk, get_url; " + "from pulp_python.app.models import PythonPackageContent, PythonRemote; " + f"c = PythonPackageContent(filename={filename!r}, **{content_data!r}); " + "c.save(); " + f"ca = ContentArtifact(artifact=None, content=c, relative_path={filename!r}); " + "ca.save(); " + f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); " + f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256={ra_sha256!r}, url={ra_url!r}); " # noqa: E501 + "ra.save(); " + "print(get_url(c))" + ) + process = subprocess.run( + ["pulpcore-manager", "shell", "-c", commands], capture_output=True + ) + + assert process.returncode == 0 + content_href = process.stdout.decode().strip() + return python_bindings.ContentPackagesApi.read(content_href) + + return _create + + +@pytest.mark.django_db +@pytest.fixture +def delete_content(): + def _delete(content_href): + from pulpcore.plugin.util import extract_pk + from pulp_python.app.models import PythonPackageContent + + content = PythonPackageContent.objects.get(pk=extract_pk(content_href)) + content.version_memberships.all().delete() + artifacts = content._artifacts.all() + content.delete() + artifacts.delete() + + return _delete + + @pytest.fixture def move_to_repository(python_bindings, monitor_task): def _move(repo_href, content_hrefs): @@ -84,6 +127,7 @@ def test_metadata_repair_command( def test_metadata_repair_endpoint( create_content_direct, + delete_content, download_python_file, monitor_task, move_to_repository, @@ -124,3 +168,54 @@ def test_metadata_repair_endpoint( assert content.packagetype == "sdist" assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" assert content.author == "" + delete_content(content.pulp_href) + + +def test_metadata_repair_endpoint_on_demand( + create_content_remote, + delete_content, + monitor_task, + move_to_repository, + python_bindings, + python_remote_factory, + python_repo_factory, +): + """ + Test repairing of package metadata via `Repositories.repair_metadata` endpoint + when only RemoteArtifact is present. + """ + python_egg_filename = "scipy-1.1.0.tar.gz" + python_egg_url = urljoin( + urljoin(PYTHON_FIXTURES_URL, "packages/"), python_egg_filename + ) + python_egg_sha256 = ( + "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1" + ) + data = { + "name": "scipy", + "version": "1.1.0", + # Wrong metadata + "author": "ME", + "packagetype": "bdist", + "requires_python": ">=3.8", + } + remote = python_remote_factory(includes=["scipy"]) + repo = python_repo_factory(remote=remote) + + content = create_content_remote( + python_egg_filename, python_egg_url, python_egg_sha256, data, remote + ) + for field, test_value in data.items(): + assert getattr(content, field) == test_value + move_to_repository(repo.pulp_href, [content.pulp_href]) + + response = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href) + monitor_task(response.task) + + new_content = python_bindings.ContentPackagesApi.read(content.pulp_href) + assert new_content.author == "" + assert new_content.name == "scipy" + assert new_content.packagetype == "sdist" + assert new_content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" + assert new_content.version == "1.1.0" + delete_content(content.pulp_href)