@@ -110,10 +110,14 @@ def _get_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
110110 bucket .download_file (url_config_file , url_config_path )
111111 url_config = {}
112112 with Path .open (url_config_path , "rb" ) as f :
113- for url_item in json .loads (f .read ().decode ("utf-8" )).get ("course_files" , []):
113+ url_json = json .loads (f .read ().decode ("utf-8" ))
114+ for url_item in url_json .get ("course_files" , []):
114115 url_key = url_item ["file_path" ]
115116 url_key = unquote_plus (url_key .lstrip (url_key .split ("/" )[0 ]))
116117 url_config [url_key ] = url_item ["url" ]
118+ for url_item in url_json .get ("assignments" , []) + url_json .get ("pages" , []):
119+ url_key = url_item .get ("name" , url_item .get ("title" ))
120+ url_config [url_key ] = url_item .get ("html_url" )
117121 return url_config
118122
119123
@@ -199,22 +203,23 @@ def transform_canvas_content_files(
199203 """
200204 basedir = course_zipfile .name .split ("." )[0 ]
201205 zipfile_path = course_zipfile .absolute ()
202- # grab published module and file items
203- published_items = [
204- Path (item ["path" ]).resolve ()
205- for item in parse_module_meta (zipfile_path )["active" ]
206- ] + [
207- Path (item ["path" ]).resolve ()
208- for item in parse_files_meta (zipfile_path )["active" ]
209- ]
210- for item in parse_web_content (zipfile_path )["active" ]:
211- published_items .append (Path (item ["path" ]).resolve ())
212- published_items .extend (
213- [
214- Path (embedded_file ).resolve ()
215- for embedded_file in item .get ("embedded_files" , [])
216- ]
217- )
206+ all_published_items = (
207+ parse_module_meta (zipfile_path )["active" ]
208+ + parse_files_meta (zipfile_path )["active" ]
209+ + parse_web_content (zipfile_path )["active" ]
210+ )
211+ published_items = {}
212+ for item in all_published_items :
213+ path = Path (item ["path" ]).resolve ()
214+ published_items [path ] = item
215+ for embedded_file in item .get ("embedded_files" , []):
216+ embedded_path = Path (embedded_file ).resolve ()
217+ if embedded_path in all_published_items :
218+ continue
219+ published_items [embedded_path ] = {
220+ "path" : embedded_path ,
221+ "title" : "" ,
222+ }
218223
219224 def _generate_content ():
220225 """Inner generator for yielding content data"""
@@ -223,7 +228,8 @@ def _generate_content():
223228 zipfile .ZipFile (zipfile_path , "r" ) as course_archive ,
224229 ):
225230 for member in course_archive .infolist ():
226- if Path (member .filename ).resolve () in published_items :
231+ member_path = Path (member .filename ).resolve ()
232+ if member_path in published_items :
227233 course_archive .extract (member , path = olx_path )
228234 log .debug ("processing active file %s" , member .filename )
229235 else :
@@ -233,7 +239,14 @@ def _generate_content():
233239 url_path = content_data ["source_path" ].lstrip (
234240 content_data ["source_path" ].split ("/" )[0 ]
235241 )
236- content_url = url_config .get (url_path , "" )
242+ item_meta = published_items .get (
243+ Path (content_data ["source_path" ]).resolve (), {}
244+ )
245+
246+ content_url = url_config .get (url_path ) or url_config .get (
247+ item_meta .get ("title" )
248+ )
249+ content_data ["content_title" ] = item_meta .get ("title" )
237250 if content_url :
238251 content_data ["url" ] = content_url
239252 yield content_data
@@ -561,6 +574,19 @@ def _title_from_html(html: str) -> str:
561574 return title .get_text ().strip () if title else ""
562575
563576
def _title_from_assignment_settings(xml_string: str) -> str:
    """
    Extract the title from assignment_settings.xml

    Args:
        xml_string: Contents of an assignment_settings.xml document, in any
            form accepted by ``ElementTree.fromstring``.

    Returns:
        The stripped text of the ``cccv1p0:title`` child of the root element,
        or an empty string when the document cannot be parsed or the title
        element is missing or has no text.
    """
    try:
        root = ElementTree.fromstring(xml_string)
    except Exception:
        # Best effort: an unreadable settings file should not abort the caller.
        # log.exception already attaches the active traceback, so no extra
        # formatting argument is needed.
        log.exception("Error parsing assignment settings XML")
        return ""
    title_elem = root.find("cccv1p0:title", NAMESPACES)
    if title_elem is None or not title_elem.text:
        return ""
    return title_elem.text.strip()
588+
589+
564590def parse_web_content (course_archive_path : str ) -> dict :
565591 """
566592 Parse html pages and assignments and return publish/active status of resources
@@ -588,9 +614,12 @@ def parse_web_content(course_archive_path: str) -> dict:
588614 if assignment_settings :
589615 xml_content = course_archive .read (assignment_settings )
590616 workflow_state = _workflow_state_from_xml (xml_content )
617+ title = _title_from_assignment_settings (xml_content )
618+ canvas_type = "assignment"
591619 else :
592620 workflow_state = _workflow_state_from_html (html_content )
593- title = _title_from_html (html_content )
621+ title = _title_from_html (html_content )
622+ canvas_type = "page"
594623
595624 lom_elem = (
596625 resource_map_item .get ("metadata" , {})
@@ -606,6 +635,7 @@ def parse_web_content(course_archive_path: str) -> dict:
606635 {
607636 "title" : title ,
608637 "path" : file_path ,
638+ "canvas_type" : canvas_type ,
609639 "embedded_files" : embedded_files ,
610640 }
611641 )
@@ -614,6 +644,7 @@ def parse_web_content(course_archive_path: str) -> dict:
614644 {
615645 "title" : title ,
616646 "path" : file_path ,
647+ "canvas_type" : canvas_type ,
617648 "embedded_files" : embedded_files ,
618649 }
619650 )
@@ -635,7 +666,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
635666 title = (
636667 item .find ("imscp:title" , NAMESPACES ).text
637668 if item .find ("imscp:title" , NAMESPACES ) is not None
638- else "No Title "
669+ else ""
639670 )
640671 resource = root .find (
641672 f'.//imscp:resource[@identifier="{ identifierref } "]' , NAMESPACES
0 commit comments