gene_ref_parser #333

Merged
merged 14 commits into from
Jul 29, 2025
113 changes: 0 additions & 113 deletions impc_etl/jobs/parse/experiment_extractor.py

This file was deleted.

128 changes: 128 additions & 0 deletions impc_etl/jobs/parse/experiment_parser.py
@@ -0,0 +1,128 @@
"""
DCC Experiment Extractor module
This module takes care of extracting experiment data from XML DCC files to Spark DataFrames.
In the XML files we can find two experiment classes: specimen level experiments and line level experiments.

The files are expected to be organized by data source in the following directory structure <DCC_XML_PATH>/<DATASOURCE>/*experiment*.xml
Each directory containing the raw XML in the XML schema defined by the DCC.
"""

import logging
import textwrap

from airflow.sdk import Variable, asset

from impc_etl.utils.airflow import create_input_asset, create_output_asset
from impc_etl.utils.spark import with_spark_session
from impc_etl.jobs.parse.xml_extraction_helper import (
    extract_dcc_xml_files,
    get_entity_by_type,
)
from impc_etl.utils.exceptions import UnsupportedEntityError

task_logger = logging.getLogger("airflow.task")
dr_tag = Variable.get("data_release_tag")

dcc_experiment_xml_path_asset = create_input_asset("xml/impc/data")
specimen_level_experiment_parquet_asset = create_output_asset(
    "specimen_level_experiment_raw_parquet"
)
line_level_experiment_parquet_asset = create_output_asset(
    "line_level_experiment_raw_parquet"
)
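# create_input_asset / create_output_asset are assumed to resolve these relative
# names to concrete storage URIs for the current data release.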


@asset.multi(
    schedule=[dcc_experiment_xml_path_asset],
    outlets=[
        specimen_level_experiment_parquet_asset,
        line_level_experiment_parquet_asset,
    ],
    dag_id=f"{dr_tag}_dcc_experiment_parser",
    description=textwrap.dedent(
        """DCC Experiment Extractor.
        Extracts both specimen level and line level experiment data from XML DCC files to Spark DataFrames.
        Processes XML files organized by data source and converts them to parquet format.
        Efficiently processes both experiment types in a single task to minimize XML file reads."""
    ),
    tags=["dcc", "experiment", "xml"],
)
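# include_xml=True is assumed to add the XML data source to the Spark session
# so the raw DCC files can be read.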
@with_spark_session(include_xml=True)
def experiment_parser():
    from pyspark.sql import SparkSession, DataFrame

    def get_experiments_by_type(dcc_df: DataFrame, entity_type: str) -> DataFrame:
        """
        Takes a DataFrame generated by `impc_etl.jobs.parse.xml_extraction_helper.extract_dcc_xml_files`
        and an entity_type ('specimen_level' or 'line_level'), and returns the matching experiment
        entities. It also expands the procedure struct column, adding all of its fields at the
        top level. Raises UnsupportedEntityError when the given entity_type is not supported.
        """
        if entity_type not in ["specimen_level", "line_level"]:
            raise UnsupportedEntityError

        # Map the logical entity type to the element name used in the DCC XML schema
        input_to_xml_entity_map = {"specimen_level": "experiment", "line_level": "line"}
        entity_type = input_to_xml_entity_map[entity_type]

        experiment_df = get_entity_by_type(
            dcc_df,
            entity_type,
            [
                "_centreID",
                "_pipeline",
                "_project",
                "_sourceFile",
                "_dataSource",
                "_sourcePhenotypingStatus",
            ],
        )

        return experiment_df.select(
            ["procedure.*"]
            + [column for column in experiment_df.columns if column != "procedure"]
        ).drop("procedure")
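
    # For illustration, with hypothetical field names: if the procedure struct
    # carries _procedureID and procedureMetadata, get_experiments_by_type returns
    # them as top-level columns alongside _centreID, _pipeline, and the rest.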

    spark = SparkSession.builder.getOrCreate()

    # Keep Spark 2.x timestamp parsing semantics and rebase dates to the hybrid
    # Julian/Gregorian calendar when writing parquet, so older DCC dates
    # round-trip unchanged
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
    spark.sql("set spark.sql.legacy.parquet.datetimeRebaseModeInWrite=LEGACY")

    dcc_experiment_xml_path = dcc_experiment_xml_path_asset.uri

    task_logger.info(
        f"Starting DCC experiment extraction from {dcc_experiment_xml_path}"
    )

    # Extract the DCC XML files and cache the resulting DataFrame: both the
    # specimen level and line level extractions derive from it, so the XML is
    # only read once, and the count below materializes the cache
    dcc_df = extract_dcc_xml_files(spark, dcc_experiment_xml_path, "experiment")
    dcc_df.cache()
    task_logger.info(
        f"Successfully extracted DCC XML files, total records: {dcc_df.count()}"
    )

    # Process specimen level experiments
    task_logger.info("Processing specimen level experiments...")
    specimen_experiment_df = get_experiments_by_type(dcc_df, "specimen_level")
    specimen_output_path = specimen_level_experiment_parquet_asset.uri
    specimen_experiment_df.write.mode("overwrite").parquet(specimen_output_path)
    specimen_count = specimen_experiment_df.count()
    task_logger.info(
        f"Successfully processed {specimen_count} specimen level experiments"
    )

    # Process line level experiments
    task_logger.info("Processing line level experiments...")
    line_experiment_df = get_experiments_by_type(dcc_df, "line_level")
    line_output_path = line_level_experiment_parquet_asset.uri
    line_experiment_df.write.mode("overwrite").parquet(line_output_path)
    line_count = line_experiment_df.count()
    task_logger.info(f"Successfully processed {line_count} line level experiments")

    # Unpersist the cached DataFrame to free memory
    dcc_df.unpersist()

    task_logger.info(
        f"DCC experiment extraction completed successfully. Total: {specimen_count + line_count} experiments processed"
    )