@@ -110,10 +110,14 @@ def _get_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
110
110
bucket .download_file (url_config_file , url_config_path )
111
111
url_config = {}
112
112
with Path .open (url_config_path , "rb" ) as f :
113
- for url_item in json .loads (f .read ().decode ("utf-8" )).get ("course_files" , []):
113
+ url_json = json .loads (f .read ().decode ("utf-8" ))
114
+ for url_item in url_json .get ("course_files" , []):
114
115
url_key = url_item ["file_path" ]
115
116
url_key = unquote_plus (url_key .lstrip (url_key .split ("/" )[0 ]))
116
117
url_config [url_key ] = url_item ["url" ]
118
+ for url_item in url_json .get ("assignments" , []) + url_json .get ("pages" , []):
119
+ url_key = url_item .get ("name" , url_item .get ("title" ))
120
+ url_config [url_key ] = url_item .get ("html_url" )
117
121
return url_config
118
122
119
123
@@ -199,22 +203,23 @@ def transform_canvas_content_files(
199
203
"""
200
204
basedir = course_zipfile .name .split ("." )[0 ]
201
205
zipfile_path = course_zipfile .absolute ()
202
- # grab published module and file items
203
- published_items = [
204
- Path (item ["path" ]).resolve ()
205
- for item in parse_module_meta (zipfile_path )["active" ]
206
- ] + [
207
- Path (item ["path" ]).resolve ()
208
- for item in parse_files_meta (zipfile_path )["active" ]
209
- ]
210
- for item in parse_web_content (zipfile_path )["active" ]:
211
- published_items .append (Path (item ["path" ]).resolve ())
212
- published_items .extend (
213
- [
214
- Path (embedded_file ).resolve ()
215
- for embedded_file in item .get ("embedded_files" , [])
216
- ]
217
- )
206
+ all_published_items = (
207
+ parse_module_meta (zipfile_path )["active" ]
208
+ + parse_files_meta (zipfile_path )["active" ]
209
+ + parse_web_content (zipfile_path )["active" ]
210
+ )
211
+ published_items = {}
212
+ for item in all_published_items :
213
+ path = Path (item ["path" ]).resolve ()
214
+ published_items [path ] = item
215
+ for embedded_file in item .get ("embedded_files" , []):
216
+ embedded_path = Path (embedded_file ).resolve ()
217
+ if embedded_path in all_published_items :
218
+ continue
219
+ published_items [embedded_path ] = {
220
+ "path" : embedded_path ,
221
+ "title" : "" ,
222
+ }
218
223
219
224
def _generate_content ():
220
225
"""Inner generator for yielding content data"""
@@ -223,7 +228,8 @@ def _generate_content():
223
228
zipfile .ZipFile (zipfile_path , "r" ) as course_archive ,
224
229
):
225
230
for member in course_archive .infolist ():
226
- if Path (member .filename ).resolve () in published_items :
231
+ member_path = Path (member .filename ).resolve ()
232
+ if member_path in published_items :
227
233
course_archive .extract (member , path = olx_path )
228
234
log .debug ("processing active file %s" , member .filename )
229
235
else :
@@ -233,7 +239,14 @@ def _generate_content():
233
239
url_path = content_data ["source_path" ].lstrip (
234
240
content_data ["source_path" ].split ("/" )[0 ]
235
241
)
236
- content_url = url_config .get (url_path , "" )
242
+ item_meta = published_items .get (
243
+ Path (content_data ["source_path" ]).resolve (), {}
244
+ )
245
+
246
+ content_url = url_config .get (url_path ) or url_config .get (
247
+ item_meta .get ("title" )
248
+ )
249
+ content_data ["content_title" ] = item_meta .get ("title" )
237
250
if content_url :
238
251
content_data ["url" ] = content_url
239
252
yield content_data
@@ -561,6 +574,19 @@ def _title_from_html(html: str) -> str:
561
574
return title .get_text ().strip () if title else ""
562
575
563
576
577
def _title_from_assignment_settings(xml_string: str) -> str:
    """
    Extract the title from assignment_settings.xml

    Args:
        xml_string (str): contents of an assignment_settings.xml document

    Returns:
        str: the stripped text of the root's ``cccv1p0:title`` child element,
            or "" if the document cannot be parsed, the element is missing,
            or the element has no text
    """
    try:
        root = ElementTree.fromstring(xml_string)
    except Exception:
        # Best effort: a malformed settings file must not abort processing of
        # the rest of the archive. log.exception already records the active
        # traceback, so no extra argument is needed to describe the failure.
        log.exception("Error parsing assignment settings XML")
        return ""
    title_elem = root.find("cccv1p0:title", NAMESPACES)
    if title_elem is None or not title_elem.text:
        return ""
    return title_elem.text.strip()
588
+
589
+
564
590
def parse_web_content (course_archive_path : str ) -> dict :
565
591
"""
566
592
Parse html pages and assignments and return publish/active status of resources
@@ -588,9 +614,12 @@ def parse_web_content(course_archive_path: str) -> dict:
588
614
if assignment_settings :
589
615
xml_content = course_archive .read (assignment_settings )
590
616
workflow_state = _workflow_state_from_xml (xml_content )
617
+ title = _title_from_assignment_settings (xml_content )
618
+ canvas_type = "assignment"
591
619
else :
592
620
workflow_state = _workflow_state_from_html (html_content )
593
- title = _title_from_html (html_content )
621
+ title = _title_from_html (html_content )
622
+ canvas_type = "page"
594
623
595
624
lom_elem = (
596
625
resource_map_item .get ("metadata" , {})
@@ -606,6 +635,7 @@ def parse_web_content(course_archive_path: str) -> dict:
606
635
{
607
636
"title" : title ,
608
637
"path" : file_path ,
638
+ "canvas_type" : canvas_type ,
609
639
"embedded_files" : embedded_files ,
610
640
}
611
641
)
@@ -614,6 +644,7 @@ def parse_web_content(course_archive_path: str) -> dict:
614
644
{
615
645
"title" : title ,
616
646
"path" : file_path ,
647
+ "canvas_type" : canvas_type ,
617
648
"embedded_files" : embedded_files ,
618
649
}
619
650
)
@@ -635,7 +666,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
635
666
title = (
636
667
item .find ("imscp:title" , NAMESPACES ).text
637
668
if item .find ("imscp:title" , NAMESPACES ) is not None
638
- else "No Title "
669
+ else ""
639
670
)
640
671
resource = root .find (
641
672
f'.//imscp:resource[@identifier="{ identifierref } "]' , NAMESPACES
0 commit comments