Skip to content

DRAFT: Add on_demand content to repair_metadata #848

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 59 additions & 5 deletions pulp_python/app/tasks/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
from pulpcore.plugin.util import get_domain

from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import artifact_to_python_content_data
from pulp_python.app.utils import (
artifact_to_python_content_data,
fetch_json_release_metadata,
parse_metadata,
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -47,8 +51,16 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
Returns:
int: The number of packages that were repaired.
"""
# TODO: Add on_demand content repair
immediate_content = content.filter(contentartifact__artifact__isnull=False)
immediate_content = (
content.filter(contentartifact__artifact__isnull=False)
.distinct()
.prefetch_related("_artifacts")
)
on_demand_content = (
content.filter(contentartifact__artifact__isnull=True)
.distinct()
.prefetch_related("contentartifact_set__remoteartifact_set")
)
domain = get_domain()

batch = []
Expand All @@ -58,12 +70,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
progress_report = ProgressReport(
message="Repairing packages' metadata",
code="repair.metadata",
total=immediate_content.count(),
total=content.count(),
)
progress_report.save()
with progress_report:
for package in progress_report.iter(
immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
immediate_content.iterator(chunk_size=1000)
):
new_data = artifact_to_python_content_data(
package.filename, package._artifacts.get(), domain
Expand All @@ -82,6 +94,48 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
batch = []
set_of_update_fields.clear()

for package in progress_report.iter(
on_demand_content.iterator(chunk_size=1000)
):
remote_artifacts = (
package.contentartifact_set.get().remoteartifact_set.all()
)
# We expect that PythonPackageContent always has correct name and version,
# and RemoteArtifact always has correct sha256
json_data = fetch_json_release_metadata(
package.name, package.version, remote_artifacts.get().remote
)
dist_data = next(
(
dist
for ra in remote_artifacts
for dist in json_data["urls"]
if ra.sha256 == dist["digests"]["sha256"]
),
None,
)
if not dist_data:
log.warning(
_("No matching distribution for {} was found.").format(package.name)
)
continue

new_data = parse_metadata(json_data["info"], package.version, dist_data)
new_data.pop("url") # belongs to RemoteArtifact, not PythonPackageContent
changed = False
for field, value in new_data.items():
if getattr(package, field) != value:
setattr(package, field, value)
set_of_update_fields.add(field)
changed = True
if changed:
batch.append(package)
if len(batch) == 1000:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
batch = []
set_of_update_fields.clear()

if batch:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
Expand Down
16 changes: 16 additions & 0 deletions pulp_python/app/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pkginfo
import re
import requests
import shutil
import tempfile
import json
Expand All @@ -10,6 +11,8 @@
from packaging.requirements import Requirement
from packaging.version import parse, InvalidVersion

from pulpcore.plugin.models import Remote


PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
"""TODO This serial constant is temporary until Python repositories implements serials"""
Expand Down Expand Up @@ -189,6 +192,19 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
return data


def fetch_json_release_metadata(name: str, version: str, remote: Remote) -> dict:
    """
    Fetches metadata for a specific release from PyPI's JSON API. A release can contain
    multiple distributions. See https://docs.pypi.org/api/json/#get-a-release for more details.

    Args:
        name: Name of the project, used verbatim as a URL path segment.
        version: Version of the release, used verbatim as a URL path segment.
        remote: Remote whose ``url`` is the base URL of the index to query.

    Returns dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.
    """
    # Accept a base URL with or without a trailing slash; plain concatenation would
    # otherwise glue "pypi" onto the host/path when the slash is missing.
    base_url = remote.url.rstrip("/")
    url = f"{base_url}/pypi/{name}/{version}/json"
    # NOTE(review): only remote.url is used here; any proxy/auth/TLS settings configured
    # on the remote are not applied to this request -- confirm that this is intended.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()


def python_content_to_json(base_path, content_query, version=None, domain=None):
"""
Converts a QuerySet of PythonPackageContent into the PyPi JSON format
Expand Down
95 changes: 95 additions & 0 deletions pulp_python/tests/functional/api/test_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,49 @@ def _create(artifact_filename, filename, content_data):
return _create


@pytest.fixture
def create_content_remote(python_bindings):
    """
    Factory fixture that creates on-demand PythonPackageContent.

    The returned callable runs a `pulpcore-manager shell` snippet that saves a
    PythonPackageContent, a ContentArtifact without an Artifact, and a RemoteArtifact
    pointing at the given remote, then reads the new content back through the bindings.
    """

    def _create(filename, ra_url, ra_sha256, content_data, remote):
        commands = (
            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
            "from pulpcore.plugin.util import extract_pk, get_url; "
            "from pulp_python.app.models import PythonPackageContent, PythonRemote; "
            f"c = PythonPackageContent(filename={filename!r}, **{content_data!r}); "
            "c.save(); "
            f"ca = ContentArtifact(artifact=None, content=c, relative_path={filename!r}); "
            "ca.save(); "
            f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
            f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256={ra_sha256!r}, url={ra_url!r}); "  # noqa: E501
            "ra.save(); "
            "print(get_url(c))"
        )
        process = subprocess.run(
            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
        )

        # Show the shell's stderr on failure; a bare returncode assertion hides the cause.
        assert process.returncode == 0, process.stderr.decode()
        # The last statement of the snippet prints the href of the created content.
        content_href = process.stdout.decode().strip()
        return python_bindings.ContentPackagesApi.read(content_href)

    return _create


# No pytest mark here: marks applied to fixtures have no effect.
@pytest.fixture
def delete_content():
    """
    Factory fixture that removes a PythonPackageContent directly through the ORM.

    The returned callable deletes the content's repository version memberships, the
    content itself, and the Artifacts that were attached to it.
    """

    def _delete(content_href):
        from pulpcore.plugin.util import extract_pk
        from pulp_python.app.models import PythonPackageContent

        content = PythonPackageContent.objects.get(pk=extract_pk(content_href))
        content.version_memberships.all().delete()
        # Evaluate the queryset *before* deleting the content. A lazy queryset would only
        # run afterwards, when the rows linking content and artifacts are presumably gone
        # already, and would then match (and delete) nothing.
        artifacts = list(content._artifacts.all())
        content.delete()
        for artifact in artifacts:
            artifact.delete()

    return _delete


@pytest.fixture
def move_to_repository(python_bindings, monitor_task):
def _move(repo_href, content_hrefs):
Expand Down Expand Up @@ -84,6 +127,7 @@ def test_metadata_repair_command(

def test_metadata_repair_endpoint(
create_content_direct,
delete_content,
download_python_file,
monitor_task,
move_to_repository,
Expand Down Expand Up @@ -124,3 +168,54 @@ def test_metadata_repair_endpoint(
assert content.packagetype == "sdist"
assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
assert content.author == ""
delete_content(content.pulp_href)


def test_metadata_repair_endpoint_on_demand(
    create_content_remote,
    delete_content,
    monitor_task,
    move_to_repository,
    python_bindings,
    python_remote_factory,
    python_repo_factory,
):
    """
    Check that the `Repositories.repair_metadata` endpoint fixes package metadata
    for content that only has a RemoteArtifact (no downloaded Artifact).
    """
    filename = "scipy-1.1.0.tar.gz"
    packages_url = urljoin(PYTHON_FIXTURES_URL, "packages/")
    package_url = urljoin(packages_url, filename)
    package_sha256 = "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1"
    # name and version are correct; the remaining fields are deliberately wrong
    # so that the repair has something to fix.
    initial_data = {
        "name": "scipy",
        "version": "1.1.0",
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
    }
    remote = python_remote_factory(includes=["scipy"])
    repo = python_repo_factory(remote=remote)

    # Create the on-demand content and confirm it starts out with the given values.
    content = create_content_remote(
        filename, package_url, package_sha256, initial_data, remote
    )
    for field in initial_data:
        assert getattr(content, field) == initial_data[field]
    move_to_repository(repo.pulp_href, [content.pulp_href])

    # Trigger the repair and wait for the task to finish.
    repair_task = python_bindings.RepositoriesPythonApi.repair_metadata(
        repo.pulp_href
    ).task
    monitor_task(repair_task)

    repaired = python_bindings.ContentPackagesApi.read(content.pulp_href)
    assert repaired.author == ""
    assert repaired.name == "scipy"
    assert repaired.packagetype == "sdist"
    assert repaired.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
    assert repaired.version == "1.1.0"
    delete_content(content.pulp_href)