Skip to content

Commit b5fcde3

Browse files
authored
canvas: citation urls for html content (#2521)
* updates to support urls for non-file items
* adding canvas type
* working urls for assignments and pages
* adding title to contentfile and test for urls
* update test
* titles for embedded pages
1 parent 9348c8e commit b5fcde3

File tree

2 files changed

+151
-22
lines changed

2 files changed

+151
-22
lines changed

learning_resources/etl/canvas.py

Lines changed: 52 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,14 @@ def _get_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
110110
bucket.download_file(url_config_file, url_config_path)
111111
url_config = {}
112112
with Path.open(url_config_path, "rb") as f:
113-
for url_item in json.loads(f.read().decode("utf-8")).get("course_files", []):
113+
url_json = json.loads(f.read().decode("utf-8"))
114+
for url_item in url_json.get("course_files", []):
114115
url_key = url_item["file_path"]
115116
url_key = unquote_plus(url_key.lstrip(url_key.split("/")[0]))
116117
url_config[url_key] = url_item["url"]
118+
for url_item in url_json.get("assignments", []) + url_json.get("pages", []):
119+
url_key = url_item.get("name", url_item.get("title"))
120+
url_config[url_key] = url_item.get("html_url")
117121
return url_config
118122

119123

@@ -199,22 +203,23 @@ def transform_canvas_content_files(
199203
"""
200204
basedir = course_zipfile.name.split(".")[0]
201205
zipfile_path = course_zipfile.absolute()
202-
# grab published module and file items
203-
published_items = [
204-
Path(item["path"]).resolve()
205-
for item in parse_module_meta(zipfile_path)["active"]
206-
] + [
207-
Path(item["path"]).resolve()
208-
for item in parse_files_meta(zipfile_path)["active"]
209-
]
210-
for item in parse_web_content(zipfile_path)["active"]:
211-
published_items.append(Path(item["path"]).resolve())
212-
published_items.extend(
213-
[
214-
Path(embedded_file).resolve()
215-
for embedded_file in item.get("embedded_files", [])
216-
]
217-
)
206+
all_published_items = (
207+
parse_module_meta(zipfile_path)["active"]
208+
+ parse_files_meta(zipfile_path)["active"]
209+
+ parse_web_content(zipfile_path)["active"]
210+
)
211+
published_items = {}
212+
for item in all_published_items:
213+
path = Path(item["path"]).resolve()
214+
published_items[path] = item
215+
for embedded_file in item.get("embedded_files", []):
216+
embedded_path = Path(embedded_file).resolve()
217+
if embedded_path in all_published_items:
218+
continue
219+
published_items[embedded_path] = {
220+
"path": embedded_path,
221+
"title": "",
222+
}
218223

219224
def _generate_content():
220225
"""Inner generator for yielding content data"""
@@ -223,7 +228,8 @@ def _generate_content():
223228
zipfile.ZipFile(zipfile_path, "r") as course_archive,
224229
):
225230
for member in course_archive.infolist():
226-
if Path(member.filename).resolve() in published_items:
231+
member_path = Path(member.filename).resolve()
232+
if member_path in published_items:
227233
course_archive.extract(member, path=olx_path)
228234
log.debug("processing active file %s", member.filename)
229235
else:
@@ -233,7 +239,14 @@ def _generate_content():
233239
url_path = content_data["source_path"].lstrip(
234240
content_data["source_path"].split("/")[0]
235241
)
236-
content_url = url_config.get(url_path, "")
242+
item_meta = published_items.get(
243+
Path(content_data["source_path"]).resolve(), {}
244+
)
245+
246+
content_url = url_config.get(url_path) or url_config.get(
247+
item_meta.get("title")
248+
)
249+
content_data["content_title"] = item_meta.get("title")
237250
if content_url:
238251
content_data["url"] = content_url
239252
yield content_data
@@ -561,6 +574,19 @@ def _title_from_html(html: str) -> str:
561574
return title.get_text().strip() if title else ""
562575

563576

577+
def _title_from_assignment_settings(xml_string: str) -> str:
    """
    Extract the title from assignment_settings.xml

    Args:
        xml_string: the raw contents of an assignment_settings.xml file
            (parse_web_content passes the result of ``course_archive.read``)

    Returns:
        The whitespace-stripped text of the ``cccv1p0:title`` child of the
        root element, or an empty string if the XML cannot be parsed, the
        element is missing, or the element has no text.
    """
    try:
        root = ElementTree.fromstring(xml_string)
    except Exception:
        # Best effort: a malformed settings file must not abort the import.
        # log.exception already attaches the traceback, so the message only
        # needs to say what was being parsed.
        log.exception("Error parsing assignment_settings.xml")
        return ""
    title_elem = root.find("cccv1p0:title", NAMESPACES)
    if title_elem is None or not title_elem.text:
        return ""
    return title_elem.text.strip()
588+
589+
564590
def parse_web_content(course_archive_path: str) -> dict:
565591
"""
566592
Parse html pages and assignments and return publish/active status of resources
@@ -588,9 +614,12 @@ def parse_web_content(course_archive_path: str) -> dict:
588614
if assignment_settings:
589615
xml_content = course_archive.read(assignment_settings)
590616
workflow_state = _workflow_state_from_xml(xml_content)
617+
title = _title_from_assignment_settings(xml_content)
618+
canvas_type = "assignment"
591619
else:
592620
workflow_state = _workflow_state_from_html(html_content)
593-
title = _title_from_html(html_content)
621+
title = _title_from_html(html_content)
622+
canvas_type = "page"
594623

595624
lom_elem = (
596625
resource_map_item.get("metadata", {})
@@ -606,6 +635,7 @@ def parse_web_content(course_archive_path: str) -> dict:
606635
{
607636
"title": title,
608637
"path": file_path,
638+
"canvas_type": canvas_type,
609639
"embedded_files": embedded_files,
610640
}
611641
)
@@ -614,6 +644,7 @@ def parse_web_content(course_archive_path: str) -> dict:
614644
{
615645
"title": title,
616646
"path": file_path,
647+
"canvas_type": canvas_type,
617648
"embedded_files": embedded_files,
618649
}
619650
)
@@ -635,7 +666,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
635666
title = (
636667
item.find("imscp:title", NAMESPACES).text
637668
if item.find("imscp:title", NAMESPACES) is not None
638-
else "No Title"
669+
else ""
639670
)
640671
resource = root.find(
641672
f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES

learning_resources/etl/canvas_test.py

Lines changed: 99 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def test_parse_module_meta_returns_active_and_unpublished(tmp_path):
195195
"""
196196
module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
197197
<modules xmlns="http://canvas.instructure.com/xsd/cccv1p0">
198-
<module>
198+
<module identifier="g31fce32d15019702eb75d5664798a877">
199199
<title>Module 1</title>
200200
<items>
201201
<item>
@@ -981,3 +981,101 @@ def test_embedded_files_from_html(tmp_path, mocker):
981981
)
982982

983983
assert any(file["source_path"] == "web_resources/html_page.html" for file in files)
984+
985+
986+
def test_get_url_config_assignments_and_pages(mocker, tmp_path):
    """
    Test that transform_canvas_content_files attaches urls from url_config to
    content files, matching regular files by path and html pages by title.
    """
    html_page_title = "html page"

    # url_config is keyed by page title for html pages and by the file path
    # (relative to the top-level folder) for regular files.
    url_config = {
        html_page_title: "https://example.com/htmlpage",
        "/file1.html": "https://example.com/file1",
    }

    run = LearningResourceRunFactory.create()

    html_content = f"""
    <html>
    <head><meta name="workflow_state" content="active"/><title>{html_page_title}</title></head>
    <body>

    </body>
    </html>
    """
    module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
    <modules xmlns="http://canvas.instructure.com/xsd/cccv1p0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    >
    <module>
    <title>Module 1</title>
    <items>
    <item>
    <workflow_state>active</workflow_state>
    <title>Item 1</title>
    <hidden>false</hidden>
    <locked>false</locked>
    <identifierref>RES1</identifierref>
    <content_type>resource</content_type>
    </item>
    </items>
    </module>
    </modules>
    """
    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
    <resources>
    <resource identifier="RES1" identifierref="RES1" type="webcontent" href="web_resources/file1.html">
    <file href="web_resources/file1.html"/>
    </resource>
    <resource identifier="RES2" type="webcontent" href="web_resources/file2.html">
    <file href="web_resources/file2.html"/>
    </resource>
    <resource identifier="RES3" type="webcontent" href="web_resources/html_page.html">
    <file href="web_resources/html_page.html"/>
    </resource>
    </resources>
    <organizations>
    <organization>
    <item identifierref="RES1">
    <title>module 1</title>
    </item>

    </organization>
    </organizations>
    </manifest>
    """
    zip_path = make_canvas_zip_with_module_meta(tmp_path, module_xml, manifest_xml)
    with zipfile.ZipFile(zip_path, "a") as zf:
        zf.writestr("web_resources/file1.html", "content of file1")
        zf.writestr("web_resources/file2.html", "content of file2")
        zf.writestr("web_resources/html_page.html", html_content)
    mocker.patch(
        "learning_resources.etl.utils.extract_text_metadata",
        return_value={"content": "test"},
    )
    mocker.patch("learning_resources.etl.canvas.bulk_resources_unpublished_actions")
    results = list(
        transform_canvas_content_files(
            Path(zip_path),
            run=run,
            url_config=url_config,
            overwrite=False,
        )
    )
    # Index the resolved url by content title. Indexing result["url"] directly
    # makes the test fail loudly if any yielded item is missing a url.
    url_by_title = {result["content_title"]: result["url"] for result in results}
    assert url_by_title[html_page_title] == "https://example.com/htmlpage"
    assert url_by_title["Item 1"] == "https://example.com/file1"

0 commit comments

Comments
 (0)