Skip to content

Commit

Permalink
Merge pull request #16 from bcgsc/improvement/SDEV-4431_migration_to_pori_python
Browse files Browse the repository at this point in the history

Improvement/sdev 4431 migration to pori python
  • Loading branch information
elewis2 authored Jul 25, 2024
2 parents 83c2cb4 + ba62186 commit 28682d0
Show file tree
Hide file tree
Showing 16 changed files with 320 additions and 349 deletions.
12 changes: 6 additions & 6 deletions pori_python/graphkb/classes.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<body>
<script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
<div class="mermaid">

classDiagram
class BasicPosition {
pos : int
Expand Down Expand Up @@ -78,9 +78,9 @@
untemplatedSeqSize : Optional[int]
}
class Statement {
conditions : List[OntologyLink]
evidence : List[OntologyLink]
evidenceLevel : List[OntologyLink]
conditions : List[Ontology]
evidence : List[Ontology]
evidenceLevel : List[Ontology]
relevance : Union
source : Union
sourceId : str
Expand All @@ -90,12 +90,12 @@
displayName : str
germline : bool
reference1 : Union
reference2 : Optional[OntologyLink]
reference2 : Optional[Ontology]
type : Union
zygosity : str
}
PositionalVariant --|> Variant

</div>
</body>
</html>
127 changes: 73 additions & 54 deletions pori_python/graphkb/genes.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Methods for retrieving gene annotation lists from GraphKB."""

from __future__ import annotations

from typing import Any, Dict, List, Sequence, Set, Tuple, cast

from pori_python.types import Ontology, Statement, Variant
from pori_python.types import IprGene, Ontology, Record, Statement, Variant

from . import GraphKBConnection
from .constants import (
Expand Down Expand Up @@ -110,6 +112,7 @@ def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> List[On
)
genes: List[Ontology] = []
for statement in statements:
statement = cast(Statement, statement)
if statement["reviewStatus"] == "failed":
continue
for condition in statement["conditions"]:
Expand Down Expand Up @@ -145,9 +148,9 @@ def get_genes_from_variant_types(
Returns:
List.<dict>: gene (Feature) records
"""
filters: List[Dict[str, Any]] = []
variant_filters: List[Dict[str, Any]] = []
if types:
filters.append(
variant_filters.append(
{"type": {"target": "Vocabulary", "filters": {"name": types, "operator": "IN"}}}
)

Expand All @@ -156,7 +159,7 @@ def get_genes_from_variant_types(
conn.query(
{
"target": "Variant",
"filters": filters,
"filters": variant_filters,
"returnProperties": ["reference1", "reference2"],
},
ignore_cache=ignore_cache,
Expand All @@ -171,14 +174,18 @@ def get_genes_from_variant_types(
if not genes:
return []

filters: List[Dict[str, Any]] = [{"biotype": "gene"}]
gene_filters: List[Dict[str, Any]] = [{"biotype": "gene"}]
if source_record_ids:
filters.append({"source": source_record_ids, "operator": "IN"})
gene_filters.append({"source": source_record_ids, "operator": "IN"})

result = cast(
List[Ontology],
conn.query(
{"target": list(genes), "returnProperties": GENE_RETURN_PROPERTIES, "filters": filters},
{
"target": list(genes),
"returnProperties": GENE_RETURN_PROPERTIES,
"filters": gene_filters,
},
ignore_cache=ignore_cache,
),
)
Expand Down Expand Up @@ -256,43 +263,52 @@ def get_gene_linked_cancer_predisposition_info(
genes = set()
non_genes = set()
infer_genes = set()
variants = {}
variants: Dict[str, Tuple[str, List[str]]] = {}

terms: dict = {term: lst for term, lst in RELEVANCE_BASE_TERMS}
relevance_rids = list(get_terms_set(conn, terms.get("cancer predisposition", [])))

for record in conn.query(
{
"target": "Statement",
"filters": {
"AND": [
{
"evidence": {
"target": "Source",
"filters": {"@rid": get_rid(conn, "Source", "CGL")},
}
},
{"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}},
]
predisp_statements = [
cast(Statement, record)
for record in conn.query(
{
"target": "Statement",
"filters": {
"AND": [
{
"evidence": {
"target": "Source",
"filters": {"@rid": get_rid(conn, "Source", "CGL")},
}
},
{
"relevance": {
"target": "Vocabulary",
"filters": {"@rid": relevance_rids},
}
},
]
},
"returnProperties": [
"conditions.@class",
"conditions.@rid",
"conditions.displayName",
"conditions.reference1.biotype",
"conditions.reference1.displayName",
"conditions.reference2.biotype",
"conditions.reference2.displayName",
],
},
"returnProperties": [
"conditions.@class",
"conditions.@rid",
"conditions.displayName",
"conditions.reference1.biotype",
"conditions.reference1.displayName",
"conditions.reference2.biotype",
"conditions.reference2.displayName",
],
},
ignore_cache=False,
):
for condition in record["conditions"]: # type: ignore
ignore_cache=False,
)
]
for record in predisp_statements:
for condition in record["conditions"]:
if condition["@class"] == "PositionalVariant":
assoc_gene_list = []
assoc_gene_list: List[str] = []
for reference in ["reference1", "reference2"]:
name = (condition.get(reference) or {}).get("displayName", "")
biotype = (condition.get(reference) or {}).get("biotype", "")
name = (condition.get(reference) or {}).get("displayName", "") # type: ignore
biotype = (condition.get(reference) or {}).get("biotype", "") # type: ignore
if name and biotype == "gene":
genes.add(name)
assoc_gene_list.append(name)
Expand All @@ -306,7 +322,7 @@ def get_gene_linked_cancer_predisposition_info(
logger.error(
f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
)
variants[condition["@rid"]] = [condition["displayName"], assoc_gene_list]
variants[condition["@rid"]] = (condition["displayName"], assoc_gene_list)

for gene, name, biotype in infer_genes:
logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
Expand Down Expand Up @@ -348,7 +364,7 @@ def get_gene_linked_pharmacogenomic_info(
genes = set()
non_genes = set()
infer_genes = set()
variants = {}
variants: Dict[str, Tuple] = {}

relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))

Expand Down Expand Up @@ -394,7 +410,7 @@ def get_gene_linked_pharmacogenomic_info(
logger.error(
f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
)
variants[condition["@rid"]] = [condition["displayName"], assoc_gene_list]
variants[condition["@rid"]] = (condition["displayName"], assoc_gene_list)
for gene, name, biotype in infer_genes:
logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
genes.add(gene)
Expand All @@ -405,13 +421,13 @@ def get_gene_linked_pharmacogenomic_info(
return sorted(genes), variants


def convert_to_rid_set(records: Sequence[Dict]) -> Set[str]:
def convert_to_rid_set(records: List[Record] | List[Ontology]) -> Set[str]:
return {r["@rid"] for r in records}


def get_gene_information(
graphkb_conn: GraphKBConnection, gene_names: Sequence[str]
) -> List[Dict[str, bool]]:
) -> List[IprGene]:
"""Create a list of gene_info flag dicts for IPR report upload.
Function is originally from pori_ipr_python::annotate.py
Expand Down Expand Up @@ -455,16 +471,19 @@ def get_gene_information(
}

for statement in statements:
statement = cast(Statement, statement)
for condition in statement["conditions"]:
if not condition.get("reference1"):
continue
gene_flags["kbStatementRelated"].add(condition["reference1"])
if condition["reference2"]:
gene_flags["kbStatementRelated"].add(condition["reference2"])
gene_flags["knownFusionPartner"].add(condition["reference1"])
gene_flags["knownFusionPartner"].add(condition["reference2"])
elif condition["@class"] == "PositionalVariant":
gene_flags["knownSmallMutation"].add(condition["reference1"])
# ignore types, as there can be various types of conditions
if condition.get("reference1"):
gene_flags["kbStatementRelated"].add(condition["reference1"]) # type: ignore
if condition.get("reference2"):
# Having a reference2 implies the event is a fusion
gene_flags["kbStatementRelated"].add(condition["reference2"]) # type: ignore
gene_flags["knownFusionPartner"].add(condition["reference1"]) # type: ignore
gene_flags["knownFusionPartner"].add(condition["reference2"]) # type: ignore
elif condition["@class"] == "PositionalVariant":
# PositionalVariant without a reference2 implies a smallMutation type
gene_flags["knownSmallMutation"].add(condition["reference1"]) # type: ignore

logger.info("fetching oncogenes list")
gene_flags["oncogene"] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
Expand All @@ -479,16 +498,16 @@ def get_gene_information(
)

logger.info(f"Setting gene_info flags on {len(gene_names)} genes")
result = []
result: List[IprGene] = []
for gene_name in gene_names:
equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
row = {"name": gene_name}
row: Dict[str, str | bool] = {"name": gene_name}
flagged = False
for flag in gene_flags:
# make smaller JSON to upload since all default to false already
if equivalent.intersection(gene_flags[flag]):
row[flag] = flagged = True
if flagged:
result.append(row)
result.append(cast(IprGene, row))

return result
10 changes: 4 additions & 6 deletions pori_python/graphkb/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,15 +453,13 @@ def type_screening(
return default_type

# When size is given
if parsed.get("untemplatedSeqSize", 0) >= threshold:
if (parsed.get("untemplatedSeqSize") or 0) >= threshold:
return parsed["type"]

# When size needs to be computed from positions
pos_start = parsed.get("break1Start", {}).get("pos", 1)
pos_end = parsed.get("break2Start", {}).get("pos", pos_start)
pos_size = 1
if prefix == "p":
pos_size = 3
pos_start: int = parsed.get("break1Start", {}).get("pos", 1) # type: ignore
pos_end: int = parsed.get("break2Start", {}).get("pos", pos_start) # type: ignore
pos_size = 3 if prefix == "p" else 1
if ((pos_end - pos_start) + 1) * pos_size >= threshold:
return parsed["type"]

Expand Down
Loading

0 comments on commit 28682d0

Please sign in to comment.