gene_ref_parser #333

Merged
merged 14 commits into from
Jul 29, 2025
113 changes: 0 additions & 113 deletions impc_etl/jobs/parse/experiment_extractor.py

This file was deleted.

128 changes: 128 additions & 0 deletions impc_etl/jobs/parse/experiment_parser.py
@@ -0,0 +1,128 @@
"""
DCC Experiment Extractor module
This module takes care of extracting experiment data from XML DCC files to Spark DataFrames.
In the XML files we can find two experiment classes: specimen level experiments and line level experiments.

The files are expected to be organized by data source in the following directory structure <DCC_XML_PATH>/<DATASOURCE>/*experiment*.xml
Each directory containing the raw XML in the XML schema defined by the DCC.
"""

import logging
import textwrap

from airflow.sdk import Variable, asset

from impc_etl.utils.airflow import create_input_asset, create_output_asset
from impc_etl.utils.spark import with_spark_session
from impc_etl.jobs.parse.xml_extraction_helper import (
    extract_dcc_xml_files,
    get_entity_by_type,
)
from impc_etl.utils.exceptions import UnsupportedEntityError

task_logger = logging.getLogger("airflow.task")
dr_tag = Variable.get("data_release_tag")

dcc_experiment_xml_path_asset = create_input_asset("xml/impc/data")
specimen_level_experiment_parquet_asset = create_output_asset(
    "specimen_level_experiment_raw_parquet"
)
line_level_experiment_parquet_asset = create_output_asset(
    "line_level_experiment_raw_parquet"
)
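# create_input_asset / create_output_asset are assumed to resolve these relative
# names to concrete storage URIs for the current data release.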


@asset.multi(
    schedule=[dcc_experiment_xml_path_asset],
    outlets=[
        specimen_level_experiment_parquet_asset,
        line_level_experiment_parquet_asset,
    ],
    dag_id=f"{dr_tag}_dcc_experiment_parser",
    description=textwrap.dedent(
        """DCC Experiment Extractor.
        Extracts both specimen level and line level experiment data from XML DCC files to Spark DataFrames.
        Processes XML files organized by data source and converts them to parquet format.
        Efficiently processes both experiment types in a single task to minimize XML file reads."""
    ),
    tags=["dcc", "experiment", "xml"],
)
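# include_xml=True is assumed to add the XML data source to the Spark session
# so the raw DCC files can be read.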
@with_spark_session(include_xml=True)
def experiment_parser():
    from pyspark.sql import SparkSession, DataFrame

    def get_experiments_by_type(dcc_df: DataFrame, entity_type: str) -> DataFrame:
        """
        Takes a DataFrame generated by `impc_etl.jobs.parse.xml_extraction_helper.extract_dcc_xml_files`
        and an entity_type ('specimen_level' or 'line_level'), and returns the matching experiment
        entities. It also expands the procedure struct column, adding all of its fields at the
        top level. Raises UnsupportedEntityError when the given entity_type is not supported.
        """
        if entity_type not in ["specimen_level", "line_level"]:
            raise UnsupportedEntityError

        # Map the logical entity type to the element name used in the DCC XML schema
        input_to_xml_entity_map = {"specimen_level": "experiment", "line_level": "line"}
        entity_type = input_to_xml_entity_map[entity_type]

        experiment_df = get_entity_by_type(
            dcc_df,
            entity_type,
            [
                "_centreID",
                "_pipeline",
                "_project",
                "_sourceFile",
                "_dataSource",
                "_sourcePhenotypingStatus",
            ],
        )

        return experiment_df.select(
            ["procedure.*"]
            + [column for column in experiment_df.columns if column != "procedure"]
        ).drop("procedure")
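
    # For illustration, with hypothetical field names: if the procedure struct
    # carries _procedureID and procedureMetadata, get_experiments_by_type returns
    # them as top-level columns alongside _centreID, _pipeline, and the rest.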

    spark = SparkSession.builder.getOrCreate()

    # Keep Spark 2.x timestamp parsing semantics and rebase dates to the hybrid
    # Julian/Gregorian calendar when writing parquet, so older DCC dates
    # round-trip unchanged
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
    spark.sql("set spark.sql.legacy.parquet.datetimeRebaseModeInWrite=LEGACY")

    dcc_experiment_xml_path = dcc_experiment_xml_path_asset.uri

    task_logger.info(
        f"Starting DCC experiment extraction from {dcc_experiment_xml_path}"
    )

    # Extract the DCC XML files and cache the resulting DataFrame: both the
    # specimen level and line level extractions derive from it, so the XML is
    # only read once, and the count below materializes the cache
    dcc_df = extract_dcc_xml_files(spark, dcc_experiment_xml_path, "experiment")
    dcc_df.cache()
    task_logger.info(
        f"Successfully extracted DCC XML files, total records: {dcc_df.count()}"
    )

    # Process specimen level experiments
    task_logger.info("Processing specimen level experiments...")
    specimen_experiment_df = get_experiments_by_type(dcc_df, "specimen_level")
    specimen_output_path = specimen_level_experiment_parquet_asset.uri
    specimen_experiment_df.write.mode("overwrite").parquet(specimen_output_path)
    specimen_count = specimen_experiment_df.count()
    task_logger.info(
        f"Successfully processed {specimen_count} specimen level experiments"
    )

    # Process line level experiments
    task_logger.info("Processing line level experiments...")
    line_experiment_df = get_experiments_by_type(dcc_df, "line_level")
    line_output_path = line_level_experiment_parquet_asset.uri
    line_experiment_df.write.mode("overwrite").parquet(line_output_path)
    line_count = line_experiment_df.count()
    task_logger.info(f"Successfully processed {line_count} line level experiments")

    # Unpersist the cached DataFrame to free memory
    dcc_df.unpersist()

    task_logger.info(
        f"DCC experiment extraction completed successfully. Total: {specimen_count + line_count} experiments processed"
    )