Skip to content

Commit 75a0686

Browse files
authored
Canvas - skip TutorProblemFile OCR if content is the same (#2557)
* adding migration * adding test
1 parent f9d07ac commit 75a0686

File tree

4 files changed

+78
-4
lines changed

4 files changed

+78
-4
lines changed

learning_resources/etl/canvas.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from learning_resources_search.constants import (
4040
CONTENT_FILE_TYPE,
4141
)
42+
from main.utils import checksum_for_content
4243

4344
log = logging.getLogger(__name__)
4445

@@ -217,7 +218,6 @@ def transform_canvas_problem_files(
217218
problem_file_data = {
218219
key: file_data[key] for key in keys_to_keep if key in file_data
219220
}
220-
221221
path = file_data["source_path"]
222222
path = path[len(settings.CANVAS_TUTORBOT_FOLDER) :]
223223
path_parts = path.split("/", 1)
@@ -239,9 +239,16 @@ def transform_canvas_problem_files(
239239
else:
240240
problem_file_data["type"] = TUTOR_PROBLEM_TYPE
241241

242+
problem_file_data["checksum"] = checksum_for_content(
243+
problem_file_data["content"]
244+
)
245+
242246
if (
243247
problem_file_data["file_extension"].lower() == ".pdf"
244248
and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
249+
and not run.problem_files.filter(
250+
checksum=problem_file_data["checksum"]
251+
).exists()
245252
):
246253
markdown_content = _pdf_to_markdown(
247254
Path(olx_path) / Path(problem_file_data["source_path"])

learning_resources/etl/canvas_test.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,11 @@
3030
LearningResourceFactory,
3131
LearningResourcePlatformFactory,
3232
LearningResourceRunFactory,
33+
TutorProblemFileFactory,
3334
)
3435
from learning_resources.models import LearningResource
3536
from learning_resources_search.constants import CONTENT_FILE_TYPE
36-
from main.utils import now_in_utc
37+
from main.utils import checksum_for_content, now_in_utc
3738

3839
pytestmark = pytest.mark.django_db
3940

@@ -471,8 +472,7 @@ def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
471472
return_value="markdown content from pdf",
472473
)
473474

474-
# Patch Path(olx_path) / Path(problem_file_data["source_path"]) to exist
475-
run = mocker.Mock()
475+
run = LearningResourceRunFactory.create()
476476

477477
results = list(transform_canvas_problem_files(zip_path, run, overwrite=True))
478478

@@ -1534,3 +1534,52 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
15341534
}
15351535
published = get_published_items(zip_path, url_config)
15361536
assert Path("web_resources/file1.pdf").resolve() in published
1537+
1538+
1539+
def test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists(
1540+
tmp_path, mocker, settings
1541+
):
1542+
"""
1543+
Test that transform_canvas_problem_files does not call _pdf_to_markdown if the checksum already exists.
1544+
"""
1545+
settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
1546+
settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
1547+
pdf_filename = "problemset3/problem.pdf"
1548+
pdf_content = b"%PDF-1.4 fake pdf content"
1549+
zip_path = make_canvas_zip(
1550+
tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)]
1551+
)
1552+
1553+
original_pdf_content = "original pdf content"
1554+
existing_checksum = checksum_for_content(original_pdf_content)
1555+
1556+
mock_run = LearningResourceRunFactory.create()
1557+
TutorProblemFileFactory.create(
1558+
run=mock_run,
1559+
problem_title="Problem Set 1",
1560+
type="problem",
1561+
checksum=existing_checksum,
1562+
)
1563+
1564+
fake_file_data = {
1565+
"run": mock_run,
1566+
"content": original_pdf_content,
1567+
"archive_checksum": "checksum",
1568+
"source_path": f"tutorbot/{pdf_filename}",
1569+
"file_extension": ".pdf",
1570+
}
1571+
1572+
mocker.patch(
1573+
"learning_resources.etl.canvas._process_olx_path",
1574+
return_value=iter([fake_file_data]),
1575+
)
1576+
1577+
pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")
1578+
1579+
results = list(transform_canvas_problem_files(zip_path, mock_run, overwrite=True))
1580+
1581+
pdf_to_md.assert_not_called()
1582+
1583+
assert len(results) == 1
1584+
assert results[0]["content"] == "original pdf content"
1585+
assert results[0]["source_path"] == f"tutorbot/{pdf_filename}"
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Generated by Django 4.2.24 on 2025-10-01 13:30
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
dependencies = [
8+
("learning_resources", "0096_tutorproblemfile_file_name"),
9+
]
10+
11+
operations = [
12+
migrations.AddField(
13+
model_name="tutorproblemfile",
14+
name="checksum",
15+
field=models.CharField(blank=True, max_length=32, null=True),
16+
),
17+
]

learning_resources/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,7 @@ class TutorProblemFile(TimestampedModel):
916916
content = models.TextField(null=True, blank=True) # noqa: DJ001
917917

918918
archive_checksum = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
919+
checksum = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
919920
source_path = models.CharField(max_length=1024, null=True, blank=True) # noqa: DJ001
920921
file_extension = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
921922
file_name = models.CharField(max_length=256, null=True, blank=True) # noqa: DJ001

0 commit comments

Comments
 (0)