-from impc_etl.jobs.load.impc_web_api import (
-    ImpcConfig,
-    PySparkTask,
-    SparkContext,
-    SparkSession,
-    col,
-    collect_set,
-    explode_outer,
-    luigi,
-    phenotype_term_zip_udf,
+import logging
+import textwrap
+from airflow.sdk import Variable, asset
+
+from impc_etl.utils.airflow import create_input_asset, create_output_asset
+from impc_etl.utils.spark import with_spark_session
+from impc_etl.utils.impc_web_api import phenotype_term_zip_udf
+
+task_logger = logging.getLogger("airflow.task")
+dr_tag = Variable.get("data_release_tag")
+
+ortholog_mapping_report_tsv_path_asset = create_input_asset("impc_web_api/mouse_human_ortholog_report.tsv")
+mp_hp_matches_csv_path_asset = create_input_asset("impc_web_api/mp_hp_matches.csv")
+gene_stats_results_json_path_asset = create_input_asset("output/impc_web_api/gene_statistical_results_service_json")
+
+batch_query_data_parquet_asset = create_output_asset("impc_web_api/batch_query_data_parquet")
+
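+# The DAG is asset-scheduled: it runs when the three input assets above are
+# updated, and declares the batch query parquet asset as its outlet.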
+@asset.multi(
+    schedule=[ortholog_mapping_report_tsv_path_asset, mp_hp_matches_csv_path_asset, gene_stats_results_json_path_asset],
+    outlets=[batch_query_data_parquet_asset],
+    dag_id=f"{dr_tag}_impc_batch_query_mapper",
+    description=textwrap.dedent(
+        """IMPC Web API batch query mapper DAG."""
+    ),
+    tags=["impc_web_api", "batch query"],
 )
-
-
-class ImpcBatchQueryMapper(PySparkTask):
-    """
-    PySpark Task class to parse GenTar Product report data.
-    """
-
-    #: Name of the Spark task
-    name: str = "ImpcBatchQueryMapper"
-
-    ortholog_mapping_report_tsv_path = luigi.Parameter()
-    mp_hp_matches_csv_path = luigi.Parameter()
-
-    #: Path of the output directory where the new parquet file will be generated.
-    output_path: luigi.Parameter = luigi.Parameter()
-
-    def requires(self):
-        return [ImpcGeneStatsResultsMapper()]
-
-    def output(self):
-        """
-        Returns the full parquet path as an output for the Luigi Task
-        (e.g. impc/dr15.2/parquet/product_report_parquet)
-        """
-        return ImpcConfig().get_target(
-            f"{self.output_path}/impc_web_api/batch_query_data_parquet"
-        )
-
-    def app_options(self):
-        """
-        Generates the options pass to the PySpark job
-        """
-        return [
-            self.ortholog_mapping_report_tsv_path,
-            self.mp_hp_matches_csv_path,
-            self.input()[0].path,
-            self.output().path,
-        ]
-
-    def main(self, sc: SparkContext, *args):
-        """
-        Takes in a SparkContext and the list of arguments generated by `app_options` and executes the PySpark job.
-        """
-        spark = SparkSession(sc)
-
-        # Parsing app options
-        ortholog_mapping_report_tsv_path = args[0]
-        mp_hp_matches_csv_path = args[1]
-        gene_stats_results_json_path = args[2]
-        output_path = args[3]
-
-        ortholog_mapping_df = spark.read.csv(
-            ortholog_mapping_report_tsv_path, sep="\t", header=True
-        )
-        stats_results = spark.read.json(gene_stats_results_json_path)
-
-        ortholog_mapping_df = ortholog_mapping_df.select(
-            col("Mgi Gene Acc Id").alias("mgiGeneAccessionId"),
-            col("Human Gene Symbol").alias("humanGeneSymbol"),
-            col("Hgnc Acc Id").alias("hgncGeneAccessionId"),
-        ).distinct()
-
-        stats_results = stats_results.join(
-            ortholog_mapping_df, "mgiGeneAccessionId", how="left_outer"
-        )
-
-        mp_matches_df = spark.read.csv(mp_hp_matches_csv_path, header=True)
-        mp_matches_df = mp_matches_df.select(
-            col("curie_x").alias("id"),
-            col("curie_y").alias("hp_term_id"),
-            col("label_y").alias("hp_term_name"),
-        ).distinct()
-
-        stats_mp_hp_df = stats_results.select(
-            "statisticalResultId",
-            "potentialPhenotypes",
-            "intermediatePhenotypes",
-            "topLevelPhenotypes",
-            "significantPhenotype",
+@with_spark_session
+def impc_batch_query_mapper():
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import col, explode_outer, collect_set, when, struct, lit
+
+    spark = SparkSession.builder.getOrCreate()
+
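+    # Resolve concrete input/output locations from the declared assets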
+    ortholog_mapping_report_tsv_path = ortholog_mapping_report_tsv_path_asset.uri
+    mp_hp_matches_csv_path = mp_hp_matches_csv_path_asset.uri
+    gene_stats_results_json_path = gene_stats_results_json_path_asset.uri
+    output_path = batch_query_data_parquet_asset.uri
+
+    ortholog_mapping_df = spark.read.csv(
+        ortholog_mapping_report_tsv_path, sep="\t", header=True
+    )
+    stats_results = spark.read.json(gene_stats_results_json_path)
+
+    ortholog_mapping_df = ortholog_mapping_df.select(
+        col("Mgi Gene Acc Id").alias("mgiGeneAccessionId"),
+        col("Human Gene Symbol").alias("humanGeneSymbol"),
+        col("Hgnc Acc Id").alias("hgncGeneAccessionId"),
+    ).distinct()
+
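+    # Left outer join keeps statistical results for genes with no human ortholog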
+    stats_results = stats_results.join(
+        ortholog_mapping_df, "mgiGeneAccessionId", how="left_outer"
+    )
+
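+    # MP-to-HP term matches: curie_x is the MP term id, curie_y/label_y the matched HP term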
+    mp_matches_df = spark.read.csv(mp_hp_matches_csv_path, header=True)
+    mp_matches_df = mp_matches_df.select(
+        col("curie_x").alias("id"),
+        col("curie_y").alias("hp_term_id"),
+        col("label_y").alias("hp_term_name"),
+    ).distinct()
+
+    stats_mp_hp_df = stats_results.select(
+        "statisticalResultId",
+        "potentialPhenotypes",
+        "intermediatePhenotypes",
+        "topLevelPhenotypes",
+        "significantPhenotype",
+    )
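+    # Explode each plural phenotype array into one row per term; [:-1] drops the
+    # trailing "s" to name the singular column (potentialPhenotypes -> potentialPhenotype)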
+    for phenotype_list_col in [
+        "potentialPhenotypes",
+        "intermediatePhenotypes",
+        "topLevelPhenotypes",
+    ]:
+        stats_mp_hp_df = stats_mp_hp_df.withColumn(
+            phenotype_list_col[:-1], explode_outer(phenotype_list_col)
         )
-        for phenotype_list_col in [
-            "potentialPhenotypes",
-            "intermediatePhenotypes",
-            "topLevelPhenotypes",
-        ]:
-            stats_mp_hp_df = stats_mp_hp_df.withColumn(
-                phenotype_list_col[:-1], explode_outer(phenotype_list_col)
-            )

-        stats_mp_hp_df = stats_mp_hp_df.join(
-            mp_matches_df,
-            (
+    stats_mp_hp_df = stats_mp_hp_df.join(
+        mp_matches_df,
+        (
             (col("significantPhenotype.id") == col("id"))
             | (col("potentialPhenotype.id") == col("id"))
             | (col("intermediatePhenotype.id") == col("id"))
             | (col("topLevelPhenotype.id") == col("id"))
-            ),
-            how="left_outer",
-        )
-        stats_mp_hp_df = stats_mp_hp_df.withColumn(
-            "humanPhenotype",
-            phenotype_term_zip_udf(col("hp_term_id"), col("hp_term_name")),
-        )
-        stats_mp_hp_df = (
-            stats_mp_hp_df.groupBy("statisticalResultId")
-            .agg(collect_set("humanPhenotype").alias("humanPhenotypes"))
-            .select("statisticalResultId", "humanPhenotypes")
-            .distinct()
-        )
-
-        stats_results = stats_results.join(stats_mp_hp_df, "statisticalResultId")
-
-        stats_results.write.parquet(output_path)
+        ),
+        how="left_outer",
+    )
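+    # Zip the matched HP term id and name into a single humanPhenotype value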
+    stats_mp_hp_df = stats_mp_hp_df.withColumn(
+        "humanPhenotype",
+        phenotype_term_zip_udf(col("hp_term_id"), col("hp_term_name")),
+    )
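+    # Collapse back to one row per statistical result, collecting the set of matched human phenotypes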
+    stats_mp_hp_df = (
+        stats_mp_hp_df.groupBy("statisticalResultId")
+        .agg(collect_set("humanPhenotype").alias("humanPhenotypes"))
+        .select("statisticalResultId", "humanPhenotypes")
+        .distinct()
+    )
+
+    stats_results = stats_results.join(stats_mp_hp_df, "statisticalResultId")
+
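+    # coalesce(100) caps the number of parquet part files; overwrite makes reruns idempotent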
+    stats_results.coalesce(100).write.parquet(output_path, mode="overwrite")
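
Note: phenotype_term_zip_udf is imported from impc_etl.utils.impc_web_api and its
body is not part of this diff. As a rough sketch (an assumption, not the actual
helper): it plausibly pairs an HP term id and name into a single {id, name}
struct, returning null when the left join found no match so that collect_set
skips the row:

    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType, StructField, StructType

    phenotype_term_schema = StructType([
        StructField("id", StringType(), True),
        StructField("name", StringType(), True),
    ])

    @udf(returnType=phenotype_term_schema)
    def phenotype_term_zip_udf(term_id, term_name):
        # Hypothetical implementation: null-safe pairing of term id and name
        if term_id is None:
            return None
        return {"id": term_id, "name": term_name}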