Skip to content

Commit 42e9834

Browse files
committed
Support on-demand content in repair_metadata
closes #849
1 parent 134afaa commit 42e9834

File tree

4 files changed

+243
-5
lines changed

4 files changed

+243
-5
lines changed

CHANGES/849.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added support for on-demand content in the `repair_metadata` endpoint.

pulp_python/app/tasks/repair.py

Lines changed: 99 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,18 @@
22
import uuid
33
from gettext import gettext as _
44

5+
from requests.exceptions import RequestException
56
from django.db.models.query import QuerySet
67
from pulpcore.plugin.models import ProgressReport
78
from pulpcore.plugin.util import get_domain
89

910
from pulp_python.app.models import PythonPackageContent, PythonRepository
10-
from pulp_python.app.utils import artifact_to_python_content_data
11+
from pulp_python.app.utils import (
12+
artifact_to_python_content_data,
13+
fetch_json_release_metadata,
14+
parse_metadata,
15+
)
16+
from itertools import groupby
1117

1218
log = logging.getLogger(__name__)
1319

@@ -47,8 +53,17 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
4753
Returns:
4854
int: The number of packages that were repaired.
4955
"""
50-
# TODO: Add on_demand content repair
51-
immediate_content = content.filter(contentartifact__artifact__isnull=False)
56+
immediate_content = (
57+
content.filter(contentartifact__artifact__isnull=False)
58+
.distinct()
59+
.prefetch_related("_artifacts")
60+
)
61+
on_demand_content = (
62+
content.filter(contentartifact__artifact__isnull=True)
63+
.distinct()
64+
.prefetch_related("contentartifact_set__remoteartifact_set")
65+
.order_by("name", "version")
66+
)
5267
domain = get_domain()
5368

5469
batch = []
@@ -58,12 +73,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
5873
progress_report = ProgressReport(
5974
message="Repairing packages' metadata",
6075
code="repair.metadata",
61-
total=immediate_content.count(),
76+
total=content.count(),
6277
)
6378
progress_report.save()
6479
with progress_report:
6580
for package in progress_report.iter(
66-
immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
81+
immediate_content.iterator(chunk_size=1000)
6782
):
6883
new_data = artifact_to_python_content_data(
6984
package.filename, package._artifacts.get(), domain
@@ -82,6 +97,85 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
8297
batch = []
8398
set_of_update_fields.clear()
8499

100+
# For on-demand content, we expect that:
101+
# 1. PythonPackageContent always has correct name and version, and one ContentArtifact
102+
# 2. RemoteArtifact always has correct sha256
103+
# Repair is only supported if all PythonPackageContent items with the same name and
104+
# version (i.e. group) share the same remote URL. Otherwise, the entire group is skipped
105+
for (name, version), group in groupby(
106+
on_demand_content.iterator(chunk_size=1000),
107+
key=lambda x: (x.name, x.version),
108+
):
109+
group = list(group)
110+
remotes = set(
111+
remote
112+
for content in group
113+
for remote in content.contentartifact_set.get()
114+
.remoteartifact_set.all()
115+
.values_list("remote__url", flat=True)
116+
)
117+
if len(remotes) != 1:
118+
log.warning(
119+
_("Only one remote url is supported for {} {}").format(
120+
name, version
121+
)
122+
)
123+
continue
124+
remote_url = remotes.pop()
125+
126+
# Retrieve data with all distributions for the given package version
127+
try:
128+
json_data = fetch_json_release_metadata(name, version, remote_url)
129+
except RequestException as exc:
130+
log.warning(
131+
_("Could not fetch metadata for {} {} from {}. Error: {}").format(
132+
name, version, remote_url, exc
133+
)
134+
)
135+
continue
136+
137+
for package in progress_report.iter(group):
138+
remote_artifacts = (
139+
package.contentartifact_set.get().remoteartifact_set.all()
140+
)
141+
# Extract data only for the specific distribution being checked
142+
dist_data = next(
143+
(
144+
dist
145+
for ra in remote_artifacts
146+
for dist in json_data["urls"]
147+
if ra.sha256 == dist["digests"]["sha256"]
148+
),
149+
None,
150+
)
151+
if not dist_data:
152+
log.warning(
153+
_(
154+
"Could not fetch distribution for {} {} with sha256 {}."
155+
).format(name, version, package.sha256)
156+
)
157+
continue
158+
159+
new_data = parse_metadata(json_data["info"], package.version, dist_data)
160+
new_data.pop("url") # belongs to RemoteArtifact
161+
new_data["pulp_domain"] = domain
162+
new_data["_pulp_domain"] = new_data["pulp_domain"]
163+
changed = False
164+
for field, value in new_data.items():
165+
if getattr(package, field) != value:
166+
setattr(package, field, value)
167+
set_of_update_fields.add(field)
168+
changed = True
169+
if changed:
170+
batch.append(package)
171+
if len(batch) == 1000:
172+
total_repaired += len(batch)
173+
PythonPackageContent.objects.bulk_update(
174+
batch, set_of_update_fields
175+
)
176+
batch = []
177+
set_of_update_fields.clear()
178+
85179
if batch:
86180
total_repaired += len(batch)
87181
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)

pulp_python/app/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pkginfo
22
import re
3+
import requests
34
import shutil
45
import tempfile
56
import json
@@ -189,6 +190,19 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
189190
return data
190191

191192

193+
def fetch_json_release_metadata(name: str, version: str, remote_url: str) -> dict:
    """
    Fetch metadata for a specific release from PyPI's JSON API.

    A release can contain multiple distributions. See
    https://docs.pypi.org/api/json/#get-a-release for more details.

    Args:
        name: The project name, e.g. "scipy".
        version: The release version, e.g. "1.1.0".
        remote_url: Base URL of the remote index, e.g. "https://pypi.org/".

    Returns:
        dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status (callers catch this and skip the package).
    """
    # Normalize the base URL so remotes configured with or without a trailing
    # slash both produce ".../pypi/<name>/<version>/json".
    url = f"{remote_url.rstrip('/')}/pypi/{name}/{version}/json"
    # Bounded timeout so a stalled remote cannot hang the repair task.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()
204+
205+
192206
def python_content_to_json(base_path, content_query, version=None, domain=None):
193207
"""
194208
Converts a QuerySet of PythonPackageContent into the PyPi JSON format

pulp_python/tests/functional/api/test_repair.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,33 @@ def _create(artifact_filename, filename, content_data):
3232
return _create
3333

3434

35+
@pytest.fixture
def create_content_remote(python_bindings):
    """
    Fixture factory creating a PythonPackageContent backed only by a
    RemoteArtifact (i.e. on-demand content with no downloaded Artifact).
    """

    def _create(filename, content_data, ra_sha256, remote):
        # Create the content, its ContentArtifact, and a RemoteArtifact
        # directly in the database through a pulpcore-manager shell.
        commands = (
            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
            "from pulpcore.plugin.util import extract_pk, get_url; "
            "from pulp_python.app.models import PythonPackageContent, PythonRemote; "
            f"c = PythonPackageContent(filename={filename!r}, **{content_data!r}); "
            "c.save(); "
            f"ca = ContentArtifact(content=c, relative_path={filename!r}); "
            "ca.save(); "
            f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
            f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256={ra_sha256!r}); "
            "ra.save(); "
            "print(get_url(c))"
        )
        process = subprocess.run(
            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
        )

        # Surface stderr on failure; a bare returncode assert hides the
        # actual exception raised inside the manager shell.
        assert process.returncode == 0, process.stderr.decode()
        content_href = process.stdout.decode().strip()
        return python_bindings.ContentPackagesApi.read(content_href)

    return _create
60+
61+
3562
@pytest.fixture
3663
def move_to_repository(python_bindings, monitor_task):
3764
def _move(repo_href, content_hrefs):
@@ -84,6 +111,7 @@ def test_metadata_repair_command(
84111

85112
def test_metadata_repair_endpoint(
86113
create_content_direct,
114+
delete_orphans_pre,
87115
download_python_file,
88116
monitor_task,
89117
move_to_repository,
@@ -124,3 +152,104 @@ def test_metadata_repair_endpoint(
124152
assert content.packagetype == "sdist"
125153
assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
126154
assert content.author == ""
155+
156+
157+
def test_metadata_repair_endpoint_on_demand(
    create_content_remote,
    delete_orphans_pre,
    monitor_task,
    move_to_repository,
    python_bindings,
    python_remote_factory,
    python_repo_factory,
):
    """
    Verify `Repositories.repair_metadata` fixes package metadata for content
    that exists only as RemoteArtifacts (on-demand), with no local Artifacts.
    """
    # 1. Set up tested data
    remote = python_remote_factory()
    repo = python_repo_factory(remote=remote)

    scipy_sdist_sha256 = "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1"
    scipy_wheel_sha256 = "0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3"
    celery_old_sha256 = "c77652ca179d14473975822dbfb1b5dab950c88c171ef6bc2257ddb9066e6790"
    celery_new_sha256 = "3e38a9a7f2868f774dffbb49e3afd2e56f57875deb06cb3ee3808f572601a8f0"

    scipy_sdist_data = {
        "name": "scipy",
        "version": "1.1.0",
        # Deliberately wrong metadata that the repair must correct
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
        "sha256": scipy_sdist_sha256,
    }
    scipy_wheel_data = dict(scipy_sdist_data, sha256=scipy_wheel_sha256)

    celery_old_data = {
        "name": "celery",
        "version": "2.4.1",
        # Deliberately wrong metadata that the repair must correct
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
    }
    celery_new_data = dict(celery_old_data, sha256=celery_new_sha256, version="4.0.0")

    packages = [
        ("scipy-1.1.0.tar.gz", scipy_sdist_data, scipy_sdist_sha256),
        ("scipy-1.1.0-cp36-none-win32.whl", scipy_wheel_data, scipy_wheel_sha256),
        ("celery-2.4.1.tar.gz", celery_old_data, celery_old_sha256),
        ("celery-4.0.0.tar.gz", celery_new_data, celery_new_sha256),
    ]

    # 2. Create content and store its href
    content_hrefs = {}
    for filename, data, sha256 in packages:
        created = create_content_remote(filename, data, sha256, remote)
        # Sanity-check the (wrong) metadata was stored as given
        for field, expected in data.items():
            assert getattr(created, field) == expected
        content_hrefs[filename] = created.pulp_href
    move_to_repository(repo.pulp_href, list(content_hrefs.values()))

    # 3. Repair metadata
    response = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href)
    monitor_task(response.task)

    # 4. Check newly created metadata, keyed by filename:
    #    (author, name, packagetype, requires_python, version)
    expected_metadata = {
        "scipy-1.1.0.tar.gz": (
            "",
            "scipy",
            "sdist",
            ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
            "1.1.0",
        ),
        "scipy-1.1.0-cp36-none-win32.whl": (
            "",
            "scipy",
            "bdist_wheel",
            ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
            "1.1.0",
        ),
        "celery-2.4.1.tar.gz": ("Ask Solem", "celery", "sdist", "", "2.4.1"),
        "celery-4.0.0.tar.gz": ("Ask Solem", "celery", "sdist", "", "4.0.0"),
    }
    for filename, fields in expected_metadata.items():
        author, name, packagetype, requires_python, version = fields
        repaired = python_bindings.ContentPackagesApi.read(content_hrefs[filename])
        assert repaired.author == author
        assert repaired.name == name
        assert repaired.packagetype == packagetype
        assert repaired.requires_python == requires_python
        assert repaired.version == version

0 commit comments

Comments
 (0)