diff --git a/pori_python/__init__.py b/pori_python/__init__.py index 44df203..71a12b1 100644 --- a/pori_python/__init__.py +++ b/pori_python/__init__.py @@ -1,2 +1 @@ -from . import ipr -from . import graphkb +from . import graphkb, ipr diff --git a/pori_python/graphkb/constants.py b/pori_python/graphkb/constants.py index f1ebf66..9f4b812 100644 --- a/pori_python/graphkb/constants.py +++ b/pori_python/graphkb/constants.py @@ -177,7 +177,7 @@ def __getitem__(self, key): } # For match.type_screening() [KBDEV-1056] -DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = 'mutation' +DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = "mutation" STRUCTURAL_VARIANT_SIZE_THRESHOLD = 48 # bp STRUCTURAL_VARIANT_TYPES = [ "structural variant", diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py index cc71ce6..1edb263 100644 --- a/pori_python/graphkb/genes.py +++ b/pori_python/graphkb/genes.py @@ -24,7 +24,10 @@ def _get_tumourigenesis_genes_list( - conn: GraphKBConnection, relevance: str, sources: List[str], ignore_cache: bool = False + conn: GraphKBConnection, + relevance: str, + sources: List[str], + ignore_cache: bool = False, ) -> List[Ontology]: statements = cast( List[Statement], @@ -34,10 +37,17 @@ def _get_tumourigenesis_genes_list( "filters": { "AND": [ {"source": {"target": "Source", "filters": {"name": sources}}}, - {"relevance": {"target": "Vocabulary", "filters": {"name": relevance}}}, + { + "relevance": { + "target": "Vocabulary", + "filters": {"name": relevance}, + } + }, ] }, - "returnProperties": [f"subject.{prop}" for prop in GENE_RETURN_PROPERTIES], + "returnProperties": [ + f"subject.{prop}" for prop in GENE_RETURN_PROPERTIES + ], }, ignore_cache=ignore_cache, ), @@ -74,7 +84,9 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]: Returns: gene (Feature) records """ - return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, [ONCOKB_SOURCE_NAME]) + return _get_tumourigenesis_genes_list( + conn, TUMOUR_SUPPRESSIVE, [ONCOKB_SOURCE_NAME] + ) def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]: @@ -147,7 +159,12 @@ def get_genes_from_variant_types( filters: List[Dict[str, Any]] = [] if types: filters.append( - {"type": {"target": "Vocabulary", "filters": {"name": types, "operator": "IN"}}} + { + "type": { + "target": "Vocabulary", + "filters": {"name": types, "operator": "IN"}, + } + } ) variants = cast( @@ -177,7 +194,11 @@ def get_genes_from_variant_types( result = cast( List[Ontology], conn.query( - {"target": list(genes), "returnProperties": GENE_RETURN_PROPERTIES, "filters": filters}, + { + "target": list(genes), + "returnProperties": GENE_RETURN_PROPERTIES, + "filters": filters, + }, ignore_cache=ignore_cache, ), ) @@ -273,7 +294,12 @@ def get_gene_linked_cancer_predisposition_info( "filters": {"@rid": get_rid(conn, "Source", "CGL")}, } }, - {"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}}, + { + "relevance": { + "target": "Vocabulary", + "filters": {"@rid": relevance_rids}, + } + }, ] }, "returnProperties": [ @@ -307,7 +333,10 @@ def get_gene_linked_cancer_predisposition_info( logger.error( f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}" ) - variants[condition["@rid"]] = [condition["displayName"], assoc_gene_list] + variants[condition["@rid"]] = [ + condition["displayName"], + assoc_gene_list, + ] for gene, name, biotype in infer_genes: logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})") @@ -359,7 +388,12 @@ def get_gene_linked_pharmacogenomic_info( { "target": "Statement", "filters": [ - {"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}} + { + "relevance": { + "target": "Vocabulary", + "filters": {"@rid": relevance_rids}, + } + } ], "returnProperties": [ "conditions.@class", @@ -397,7 +431,10 @@ def get_gene_linked_pharmacogenomic_info( logger.error( f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}" ) - variants[condition["@rid"]] = [condition["displayName"], assoc_gene_list] + variants[condition["@rid"]] = [ + condition["displayName"], + assoc_gene_list, + ] for gene, name, biotype in infer_genes: logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})") genes.add(gene) @@ -449,7 +486,9 @@ def get_gene_information( gene_names = sorted(set(gene_names)) statements = graphkb_conn.query(body) - statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS] + statements = [ + s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS + ] gene_flags: Dict[str, Set[str]] = { "kbStatementRelated": set(), @@ -472,9 +511,13 @@ def get_gene_information( logger.info("fetching oncogenes list") gene_flags["oncogene"] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn)) logger.info("fetching tumour supressors list") - gene_flags["tumourSuppressor"] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn)) + gene_flags["tumourSuppressor"] = convert_to_rid_set( + get_oncokb_tumour_supressors(graphkb_conn) + ) logger.info("fetching cancerGeneListMatch list") - gene_flags["cancerGeneListMatch"] = convert_to_rid_set(get_cancer_genes(graphkb_conn)) + gene_flags["cancerGeneListMatch"] = convert_to_rid_set( + get_cancer_genes(graphkb_conn) + ) logger.info("fetching therapeutic associated genes lists") gene_flags["therapeuticAssociated"] = convert_to_rid_set( @@ -484,7 +527,9 @@ def get_gene_information( logger.info(f"Setting gene_info flags on {len(gene_names)} genes") result = [] for gene_name in gene_names: - equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name)) + equivalent = convert_to_rid_set( + get_equivalent_features(graphkb_conn, gene_name) + ) row = {"name": gene_name} flagged = False for flag in gene_flags: diff --git a/pori_python/graphkb/match.py b/pori_python/graphkb/match.py index ec82e57..631d073 100644 --- a/pori_python/graphkb/match.py +++ b/pori_python/graphkb/match.py @@ -15,7 +15,14 @@ STRUCTURAL_VARIANT_TYPES, VARIANT_RETURN_PROPERTIES, ) -from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant +from .types import ( + BasicPosition, + Ontology, + ParsedVariant, + PositionalVariant, + Record, + Variant, +) from .util import ( FeatureNotFoundError, convert_to_rid_list, @@ -23,7 +30,7 @@ looks_like_rid, stringifyVariant, ) -from .vocab import get_equivalent_terms, get_terms_set, get_term_tree +from .vocab import get_equivalent_terms, get_term_tree, get_terms_set FEATURES_CACHE: Set[str] = set() @@ -63,7 +70,8 @@ def get_equivalent_features( return cast( List[Ontology], conn.query( - {"target": [gene_name], "queryType": "similarTo"}, ignore_cache=ignore_cache + {"target": [gene_name], "queryType": "similarTo"}, + ignore_cache=ignore_cache, ), ) @@ -82,9 +90,16 @@ def get_equivalent_features( filters.append({"sourceId": gene_name}) if source_id_version: filters.append( - {"OR": [{"sourceIdVersion": source_id_version}, {"sourceIdVersion": None}]} + { + "OR": [ + {"sourceIdVersion": source_id_version}, + {"sourceIdVersion": None}, + ] + } ) - elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache: + elif ( + FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache + ): return [] else: filters.append({"OR": [{"sourceId": gene_name}, {"name": gene_name}]}) @@ -92,7 +107,10 @@ def get_equivalent_features( return cast( List[Ontology], conn.query( - {"target": {"target": "Feature", "filters": filters}, "queryType": "similarTo"}, + { + "target": {"target": "Feature", "filters": filters}, + "queryType": "similarTo", + }, ignore_cache=ignore_cache, ), ) @@ -105,7 +123,13 @@ def cache_missing_features(conn: GraphKBConnection) -> None: """ genes = cast( List[Ontology], - conn.query({"target": "Feature", "returnProperties": ["name", "sourceId"], "neighbors": 0}), + conn.query( + { + "target": "Feature", + "returnProperties": ["name", "sourceId"], + "neighbors": 0, + } + ), ) for gene in genes: if gene["name"]: @@ -160,7 +184,9 @@ def match_category_variant( ) if not terms: - raise ValueError(f"unable to find the term/category ({category}) or any equivalent") + raise ValueError( + f"unable to find the term/category ({category}) or any equivalent" + ) # find the variant list return cast( @@ -175,7 +201,12 @@ def match_category_variant( ], }, "queryType": "similarTo", - "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"], + "edges": [ + "AliasOf", + "DeprecatedBy", + "CrossReferenceOf", + "GeneralizationOf", + ], "treeEdges": ["Infers"], "returnProperties": VARIANT_RETURN_PROPERTIES, }, @@ -185,7 +216,11 @@ def match_category_variant( def match_copy_variant( - conn: GraphKBConnection, gene_name: str, category: str, drop_homozygous: bool = False, **kwargs + conn: GraphKBConnection, + gene_name: str, + category: str, + drop_homozygous: bool = False, + **kwargs, ) -> List[Variant]: """ Returns a list of variants matching the input variant @@ -226,7 +261,9 @@ def match_expression_variant( def positions_overlap( - pos_record: BasicPosition, range_start: BasicPosition, range_end: Optional[BasicPosition] = None + pos_record: BasicPosition, + range_start: BasicPosition, + range_end: Optional[BasicPosition] = None, ) -> bool: """ Check if 2 Position records from GraphKB indicate an overlap @@ -350,9 +387,14 @@ def compare_positional_variants( reference_variant["untemplatedSeq"] not in AMBIGUOUS_AA and variant["untemplatedSeq"] not in AMBIGUOUS_AA ): - if reference_variant["untemplatedSeq"].lower() != variant["untemplatedSeq"].lower(): + if ( + reference_variant["untemplatedSeq"].lower() + != variant["untemplatedSeq"].lower() + ): return False - elif len(variant["untemplatedSeq"]) != len(reference_variant["untemplatedSeq"]): + elif len(variant["untemplatedSeq"]) != len( + reference_variant["untemplatedSeq"] + ): return False # If both variants have a reference sequence, @@ -374,9 +416,7 @@ def compare_positional_variants( def type_screening( - conn: GraphKBConnection, - parsed: ParsedVariant, - updateStructuralTypes=False, + conn: GraphKBConnection, parsed: ParsedVariant, updateStructuralTypes=False ) -> str: """ [KBDEV-1056] @@ -424,40 +464,42 @@ def type_screening( # Will use either hardcoded type list or an updated list from the API if updateStructuralTypes: - rids = list(get_terms_set(conn, ['structural variant'])) + rids = list(get_terms_set(conn, ["structural variant"])) records = conn.get_records_by_id(rids) - structuralVariantTypes = [el['name'] for el in records] + structuralVariantTypes = [el["name"] for el in records] # Unambiguous non-structural variation type - if parsed['type'] not in structuralVariantTypes: - return parsed['type'] + if parsed["type"] not in structuralVariantTypes: + return parsed["type"] # Unambiguous structural variation type - if parsed['type'] in ['fusion', 'translocation']: - return parsed['type'] - if parsed.get('reference2', None): - return parsed['type'] - prefix = parsed.get('prefix', 'g') - if prefix == 'y': # Assuming all variations using cytoband coordiantes meet the size threshold - return parsed['type'] + if parsed["type"] in ["fusion", "translocation"]: + return parsed["type"] + if parsed.get("reference2", None): + return parsed["type"] + prefix = parsed.get("prefix", "g") + if ( + prefix == "y" + ): # Assuming all variations using cytoband coordiantes meet the size threshold + return parsed["type"] # When size cannot be determined: exonic and intronic coordinates # e.g. "MET:e.14del" meaning "Any deletion occuring at the 14th exon" - if prefix in ['e', 'i']: # Assuming they don't meet the size threshold + if prefix in ["e", "i"]: # Assuming they don't meet the size threshold return default_type # When size is given - if parsed.get('untemplatedSeqSize', 0) >= threshold: - return parsed['type'] + if parsed.get("untemplatedSeqSize", 0) >= threshold: + return parsed["type"] # When size needs to be computed from positions - pos_start = parsed.get('break1Start', {}).get('pos', 1) - pos_end = parsed.get('break2Start', {}).get('pos', pos_start) + pos_start = parsed.get("break1Start", {}).get("pos", 1) + pos_end = parsed.get("break2Start", {}).get("pos", pos_start) pos_size = 1 - if prefix == 'p': + if prefix == "p": pos_size = 3 if ((pos_end - pos_start) + 1) * pos_size >= threshold: - return parsed['type'] + return parsed["type"] # Default return default_type @@ -533,7 +575,11 @@ def match_positional_variant( gene1 = parsed["reference1"] gene1_features = get_equivalent_features( - conn, gene1, source=gene_source, is_source_id=gene_is_source_id, ignore_cache=ignore_cache + conn, + gene1, + source=gene_source, + is_source_id=gene_is_source_id, + ignore_cache=ignore_cache, ) features = convert_to_rid_list(gene1_features) @@ -584,12 +630,15 @@ def match_positional_variant( ] filtered_similarOnly: List[Record] = [] # For post filter match use - filtered_similarAndGeneric: List[Record] = [] # To be added to the matches at the very end + filtered_similarAndGeneric: List[Record] = ( + [] + ) # To be added to the matches at the very end for row in cast( List[Record], conn.query( - {"target": "PositionalVariant", "filters": query_filters}, ignore_cache=ignore_cache + {"target": "PositionalVariant", "filters": query_filters}, + ignore_cache=ignore_cache, ), ): # TODO: Check if variant and reference_variant should be interchanged @@ -612,7 +661,12 @@ def match_positional_variant( { "target": convert_to_rid_list(filtered_similarOnly), "queryType": "similarTo", - "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"], + "edges": [ + "AliasOf", + "DeprecatedBy", + "CrossReferenceOf", + "GeneralizationOf", + ], "treeEdges": ["Infers"], "returnProperties": POS_VARIANT_RETURN_PROPERTIES, }, diff --git a/pori_python/graphkb/statement.py b/pori_python/graphkb/statement.py index c969e8f..032498b 100644 --- a/pori_python/graphkb/statement.py +++ b/pori_python/graphkb/statement.py @@ -1,7 +1,11 @@ from typing import List, cast from . import GraphKBConnection -from .constants import FAILED_REVIEW_STATUS, RELEVANCE_BASE_TERMS, STATEMENT_RETURN_PROPERTIES +from .constants import ( + FAILED_REVIEW_STATUS, + RELEVANCE_BASE_TERMS, + STATEMENT_RETURN_PROPERTIES, +) from .types import CategoryBaseTermMapping, Statement, Variant from .util import convert_to_rid_list from .vocab import get_terms_set @@ -23,7 +27,9 @@ def categorize_relevance( def get_statements_from_variants( - graphkb_conn: GraphKBConnection, variants: List[Variant], failed_review: bool = False + graphkb_conn: GraphKBConnection, + variants: List[Variant], + failed_review: bool = False, ) -> List[Statement]: """Given a list of variant records from GraphKB, return related statements. @@ -38,10 +44,15 @@ def get_statements_from_variants( statements = graphkb_conn.query( { "target": "Statement", - "filters": {"conditions": convert_to_rid_list(variants), "operator": "CONTAINSANY"}, + "filters": { + "conditions": convert_to_rid_list(variants), + "operator": "CONTAINSANY", + }, "returnProperties": STATEMENT_RETURN_PROPERTIES, } ) if not failed_review: - statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS] + statements = [ + s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS + ] return [cast(Statement, s) for s in statements] diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py index 7c6ef94..64b82e0 100644 --- a/pori_python/graphkb/util.py +++ b/pori_python/graphkb/util.py @@ -1,3 +1,7 @@ +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + import hashlib import json import logging @@ -6,10 +10,6 @@ from datetime import datetime from typing import Any, Dict, Iterable, List, Optional, Union, cast -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - from .constants import DEFAULT_LIMIT, DEFAULT_URL, TYPES_TO_NOTATION, AA_3to1_MAPPING from .types import OntologyTerm, ParsedVariant, PositionalVariant, Record @@ -113,7 +113,10 @@ def __init__( self.url = url self.username = username self.password = password - self.headers = {"Accept": "application/json", "Content-Type": "application/json"} + self.headers = { + "Accept": "application/json", + "Content-Type": "application/json", + } self.cache: Dict[Any, Any] = {} if not use_global_cache else QUERY_CACHE self.request_count = 0 self.first_request: Optional[datetime] = None @@ -125,7 +128,9 @@ def __init__( def load(self) -> Optional[float]: if self.first_request and self.last_request: return ( - self.request_count * 1000 / millis_interval(self.first_request, self.last_request) + self.request_count + * 1000 + / millis_interval(self.first_request, self.last_request) ) return None @@ -266,7 +271,9 @@ def query( return self.cache[hash_code] while True: - content = self.post("query", data={**request_body, "limit": limit, "skip": len(result)}) + content = self.post( + "query", data={**request_body, "limit": limit, "skip": len(result)} + ) records = content["result"] result.extend(records) if len(records) < limit or not paginate: @@ -358,7 +365,9 @@ def stripRefSeq(breakRepr: str) -> str: return breakRepr -def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = True) -> str: +def stripDisplayName( + displayName: str, withRef: bool = True, withRefSeq: bool = True +) -> str: match: object = re.search(r"^(.*)(\:)(.*)$", displayName) if match and not withRef: if withRefSeq: @@ -376,7 +385,9 @@ def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = while new_matches: new_matches = re.search(r"(.*)([A-Z]|\?)([0-9]+)(.*)", rest) if new_matches: - rest = new_matches.group(1) + new_matches.group(3) + new_matches.group(4) + rest = ( + new_matches.group(1) + new_matches.group(3) + new_matches.group(4) + ) # refSeq before '>' new_matches = re.search(r"^([0-9]*)([A-Z]*|\?)(\>)(.*)$", rest) @@ -392,7 +403,9 @@ def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = def stringifyVariant( - variant: Union[PositionalVariant, ParsedVariant], withRef: bool = True, withRefSeq: bool = True + variant: Union[PositionalVariant, ParsedVariant], + withRef: bool = True, + withRefSeq: bool = True, ) -> str: """ Convert variant record to a string representation (displayName/hgvs) @@ -458,8 +471,12 @@ def stringifyVariant( break2Repr_noParentheses = stripParentheses(break2Repr) result.append(f"({break1Repr_noParentheses},{break2Repr_noParentheses})") else: - break1Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break1Repr)) - break2Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break2Repr)) + break1Repr_noParentheses_noRefSeq = stripRefSeq( + stripParentheses(break1Repr) + ) + break2Repr_noParentheses_noRefSeq = stripRefSeq( + stripParentheses(break2Repr) + ) result.append( f"({break1Repr_noParentheses_noRefSeq},{break2Repr_noParentheses_noRefSeq})" ) diff --git a/pori_python/graphkb/vocab.py b/pori_python/graphkb/vocab.py index 51446db..1b6c609 100644 --- a/pori_python/graphkb/vocab.py +++ b/pori_python/graphkb/vocab.py @@ -24,7 +24,9 @@ def get_equivalent_terms( base_term_name: the name to get superclasses of root_exclude_term: the parent term to exlcude along with all of its parent terms """ - base_records = convert_to_rid_list(conn.query(build_base_query(ontology_class, base_term_name))) + base_records = convert_to_rid_list( + conn.query(build_base_query(ontology_class, base_term_name)) + ) if not base_records: return [] base_term_parents = cast( @@ -34,7 +36,13 @@ def get_equivalent_terms( "target": {"target": base_records, "queryType": "descendants"}, "queryType": "similarTo", "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + "returnProperties": [ + "sourceId", + "sourceIdVersion", + "deprecated", + "name", + "@rid", + ], }, ignore_cache=ignore_cache, ), @@ -94,7 +102,9 @@ def get_term_tree( Note: this must be done in 2 calls to avoid going up and down the tree in a single query (exclude adjacent siblings) """ # get all child terms of the subclass tree and disambiguate them - base_records = convert_to_rid_list(conn.query(build_base_query(ontology_class, base_term_name))) + base_records = convert_to_rid_list( + conn.query(build_base_query(ontology_class, base_term_name)) + ) if not base_records: return [] child_terms = cast( @@ -104,7 +114,13 @@ def get_term_tree( "target": {"target": base_records, "queryType": "ancestors"}, "queryType": "similarTo", "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + "returnProperties": [ + "sourceId", + "sourceIdVersion", + "deprecated", + "name", + "@rid", + ], }, ignore_cache=ignore_cache, ), @@ -176,7 +192,9 @@ def get_term_by_name( def get_terms_set( - graphkb_conn: GraphKBConnection, base_terms: Iterable[str], ignore_cache: bool = False + graphkb_conn: GraphKBConnection, + base_terms: Iterable[str], + ignore_cache: bool = False, ) -> Set[str]: """Get a set of vocabulary rids given some base/parent term names.""" base_terms = [base_terms] if isinstance(base_terms, str) else base_terms @@ -188,7 +206,10 @@ def get_terms_set( terms.update( convert_to_rid_list( get_term_tree( - graphkb_conn, base_term, include_superclasses=False, ignore_cache=ignore_cache + graphkb_conn, + base_term, + include_superclasses=False, + ignore_cache=ignore_cache, ) ) ) diff --git a/pori_python/ipr/annotate.py b/pori_python/ipr/annotate.py index a7f20d3..92aff40 100644 --- a/pori_python/ipr/annotate.py +++ b/pori_python/ipr/annotate.py @@ -4,19 +4,26 @@ from requests.exceptions import HTTPError +from pandas import isnull +from tqdm import tqdm +from typing import Dict, List, Sequence + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb import match as gkb_match from pori_python.graphkb.match import INPUT_COPY_CATEGORIES from pori_python.graphkb.statement import get_statements_from_variants from pori_python.graphkb.types import Variant from pori_python.graphkb.util import FeatureNotFoundError -from pandas import isnull -from tqdm import tqdm -from typing import Dict, List, Sequence from .constants import TMB_HIGH_CATEGORY from .ipr import convert_statements_to_alterations -from .types import GkbStatement, IprCopyVariant, IprExprVariant, IprStructuralVariant, KbMatch +from .types import ( + GkbStatement, + IprCopyVariant, + IprExprVariant, + IprStructuralVariant, + KbMatch, +) from .util import Hashabledict, convert_to_rid_set, logger REPORTED_COPY_VARIANTS = (INPUT_COPY_CATEGORIES.AMP, INPUT_COPY_CATEGORIES.DEEP) @@ -31,16 +38,18 @@ def get_second_pass_variants( # second-pass matching all_inferred_matches: Dict[str, Variant] = {} inferred_variants = { - (s['subject']['@rid'], s['relevance']['name']) + (s["subject"]["@rid"], s["relevance"]["name"]) for s in statements - if s['subject'] and s['subject']['@class'] in ('Feature', 'Signature') + if s["subject"] and s["subject"]["@class"] in ("Feature", "Signature") } for reference1, variant_type in inferred_variants: - variants = gkb_match.match_category_variant(graphkb_conn, reference1, variant_type) + variants = gkb_match.match_category_variant( + graphkb_conn, reference1, variant_type + ) for variant in variants: - all_inferred_matches[variant['@rid']] = variant + all_inferred_matches[variant["@rid"]] = variant inferred_matches: List[Variant] = list(all_inferred_matches.values()) return inferred_matches @@ -57,7 +66,7 @@ def get_ipr_statements_from_variants( return [] rows = [] statements = get_statements_from_variants(graphkb_conn, matches) - existing_statements = {s['@rid'] for s in statements} + existing_statements = {s["@rid"] for s in statements} for ipr_row in convert_statements_to_alterations( graphkb_conn, statements, disease_name, convert_to_rid_set(matches) @@ -70,13 +79,17 @@ def get_ipr_statements_from_variants( inferred_statements = [ s for s in get_statements_from_variants(graphkb_conn, inferred_matches) - if s['@rid'] not in existing_statements # do not duplicate if non-inferred match + if s["@rid"] + not in existing_statements # do not duplicate if non-inferred match ] for ipr_row in convert_statements_to_alterations( - graphkb_conn, inferred_statements, disease_name, convert_to_rid_set(inferred_matches) + graphkb_conn, + inferred_statements, + disease_name, + convert_to_rid_set(inferred_matches), ): - ipr_row['kbData']['inferred'] = True + ipr_row["kbData"]["inferred"] = True rows.append(ipr_row) return rows @@ -104,8 +117,8 @@ def annotate_expression_variants( logger.info(f"Starting annotation of {len(variants)} expression category_variants") iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = row['gene'] - variant = row['variant'] + gene = row["gene"] + variant = row["variant"] if not variant: skipped += 1 @@ -114,23 +127,25 @@ def annotate_expression_variants( try: matches = gkb_match.match_expression_variant(graphkb_conn, gene, variant) - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name): - ipr_row['variant'] = row['key'] - ipr_row['variantType'] = row.get('variantType', 'exp') + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, matches, disease_name + ): + ipr_row["variant"] = row["key"] + ipr_row["variantType"] = row.get("variantType", "exp") alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') + logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") except ValueError as err: - logger.error(f'failed to match variants ({gene} {variant}): {err}') + logger.error(f"failed to match variants ({gene} {variant}): {err}") if skipped: - logger.info(f'skipped matching {skipped} expression information rows') + logger.info(f"skipped matching {skipped} expression information rows") if problem_genes: - logger.error(f'gene finding failures for expression {sorted(problem_genes)}') - logger.error(f'gene finding falure for {len(problem_genes)} expression genes') + logger.error(f"gene finding failures for expression {sorted(problem_genes)}") + logger.error(f"gene finding falure for {len(problem_genes)} expression genes") logger.info( - f'matched {len(variants)} expression variants to {len(alterations)} graphkb annotations' + f"matched {len(variants)} expression variants to {len(alterations)} graphkb annotations" ) return alterations @@ -157,36 +172,42 @@ def annotate_copy_variants( logger.info(f"Starting annotation of {len(variants)} copy category_variants") iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = row['gene'] - variant = row['variant'] + gene = row["gene"] + variant = row["variant"] if variant not in REPORTED_COPY_VARIANTS: # https://www.bcgsc.ca/jira/browse/GERO-77 skipped += 1 - logger.debug(f"Dropping {gene} copy change '{variant}' - not in REPORTED_COPY_VARIANTS") + logger.debug( + f"Dropping {gene} copy change '{variant}' - not in REPORTED_COPY_VARIANTS" + ) continue try: matches = gkb_match.match_copy_variant(graphkb_conn, gene, variant) - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name): - ipr_row['variant'] = row['key'] - ipr_row['variantType'] = row.get('variantType', 'cnv') + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, matches, disease_name + ): + ipr_row["variant"] = row["key"] + ipr_row["variantType"] = row.get("variantType", "cnv") alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') + logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") except ValueError as err: - logger.error(f'failed to match variants ({gene} {variant}): {err}') + logger.error(f"failed to match variants ({gene} {variant}): {err}") if skipped: logger.info( - f'skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}' + f"skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}" ) if problem_genes: - logger.error(f'gene finding failures for copy variants {sorted(problem_genes)}') - logger.error(f'gene finding failure for {len(problem_genes)} copy variant genes') + logger.error(f"gene finding failures for copy variants {sorted(problem_genes)}") + logger.error( + f"gene finding failure for {len(problem_genes)} copy variant genes" + ) logger.info( - f'matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations' + f"matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations" ) return alterations @@ -208,14 +229,14 @@ def annotate_positional_variants( Returns: list of kbMatches records for IPR """ - VARIANT_KEYS = ('variant', 'hgvsProtein', 'hgvsCds', 'hgvsGenomic') + VARIANT_KEYS = ("variant", "hgvsProtein", "hgvsCds", "hgvsGenomic") errors = 0 alterations = [] problem_genes = set() iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - if not row.get('gene') and (not row.get('gene1') or not row.get('gene2')): + if not row.get("gene") and (not row.get("gene1") or not row.get("gene2")): # https://www.bcgsc.ca/jira/browse/GERO-56?focusedCommentId=1234791&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1234791 # should not match single gene SVs continue @@ -232,58 +253,62 @@ def annotate_positional_variants( # DEVSU-1885 - fix malformed single deletion described as substitution of blank # eg. deletion described as substitution with nothing: 'chr1:g.150951027T>' if ( - variant[-1] == '>' - and 'g.' in variant + variant[-1] == ">" + and "g." in variant and variant[-2].isalpha() and variant[-3].isnumeric() ): logger.warning( f"Assuming malformed deletion variant {variant} is {variant[:-2] + 'del'}" ) - variant = variant[:-2] + 'del' - matches = gkb_match.match_positional_variant(graphkb_conn, variant) + variant = variant[:-2] + "del" + matches = gkb_match.match_positional_variant( + graphkb_conn, variant + ) else: raise parse_err for ipr_row in get_ipr_statements_from_variants( graphkb_conn, matches, disease_name ): - ipr_row['variant'] = row['key'] - ipr_row['variantType'] = row.get( - 'variantType', 'mut' if row.get('gene') else 'sv' + ipr_row["variant"] = row["key"] + ipr_row["variantType"] = row.get( + "variantType", "mut" if row.get("gene") else "sv" ) alterations.append(Hashabledict(ipr_row)) except FeatureNotFoundError as err: - logger.debug(f'failed to match positional variants ({variant}): {err}') + logger.debug(f"failed to match positional variants ({variant}): {err}") errors += 1 - if 'gene' in row: - problem_genes.add(row['gene']) - elif 'gene1' in row and f"({row['gene1']})" in str(err): - problem_genes.add(row['gene1']) - elif 'gene2' in row and f"({row['gene2']})" in str(err): - problem_genes.add(row['gene2']) - elif 'gene1' in row and 'gene2' in row: - problem_genes.add(row['gene1']) - problem_genes.add(row['gene2']) + if "gene" in row: + problem_genes.add(row["gene"]) + elif "gene1" in row and f"({row['gene1']})" in str(err): + problem_genes.add(row["gene1"]) + elif "gene2" in row and f"({row['gene2']})" in str(err): + problem_genes.add(row["gene2"]) + elif "gene1" in row and "gene2" in row: + problem_genes.add(row["gene1"]) + problem_genes.add(row["gene2"]) else: raise err except HTTPError as err: errors += 1 - logger.error(f'failed to match positional variants ({variant}): {err}') + logger.error(f"failed to match positional variants ({variant}): {err}") if problem_genes: - logger.error(f'gene finding failures for {sorted(problem_genes)}') - logger.error(f'{len(problem_genes)} gene finding failures for positional variants') + logger.error(f"gene finding failures for {sorted(problem_genes)}") + logger.error( + f"{len(problem_genes)} gene finding failures for positional variants" + ) if errors: - logger.error(f'skipped {errors} positional variants due to errors') + logger.error(f"skipped {errors} positional variants due to errors") # drop duplicates alterations: List[KbMatch] = list(set(alterations)) - variant_types = ", ".join(sorted(set([alt['variantType'] for alt in alterations]))) + variant_types = ", ".join(sorted(set([alt["variantType"] for alt in alterations]))) logger.info( - f'matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations' + f"matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations" ) return alterations @@ -291,8 +316,8 @@ def annotate_positional_variants( def annotate_msi( graphkb_conn: GraphKBConnection, - disease_name: str = 'cancer', - msi_category: str = 'microsatellite instability', + disease_name: str = "cancer", + msi_category: str = "microsatellite instability", ) -> List[KbMatch]: """Annotate microsatellite instablity from GraphKB in the IPR alterations format. @@ -307,26 +332,33 @@ def annotate_msi( gkb_matches = [] msi_categories = graphkb_conn.query( { - 'target': { - 'target': 'CategoryVariant', - 'filters': { - 'reference1': {'target': 'Signature', 'filters': {'name': msi_category}} + "target": { + "target": "CategoryVariant", + "filters": { + "reference1": { + "target": "Signature", + "filters": {"name": msi_category}, + } }, }, - 'queryType': 'similarTo', - 'returnProperties': ['@rid', 'displayName'], + "queryType": "similarTo", + "returnProperties": ["@rid", "displayName"], } ) if msi_categories: - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, msi_categories, disease_name): - ipr_row['variant'] = msi_category - ipr_row['variantType'] = 'msi' + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, msi_categories, disease_name + ): + ipr_row["variant"] = msi_category + ipr_row["variantType"] = "msi" gkb_matches.append(ipr_row) return gkb_matches def annotate_tmb( - graphkb_conn: GraphKBConnection, disease_name: str = 'cancer', category: str = TMB_HIGH_CATEGORY + graphkb_conn: GraphKBConnection, + disease_name: str = "cancer", + category: str = TMB_HIGH_CATEGORY, ) -> List[KbMatch]: """Annotate Tumour Mutation Burden (tmb) categories from GraphKB in the IPR alterations format. @@ -342,22 +374,26 @@ def annotate_tmb( gkb_matches = [] categories = graphkb_conn.query( { - 'target': { - 'target': 'CategoryVariant', - 'filters': { - 'reference1': { - 'target': 'Signature', - 'filters': {'OR': [{'name': category}, {'displayName': category}]}, + "target": { + "target": "CategoryVariant", + "filters": { + "reference1": { + "target": "Signature", + "filters": { + "OR": [{"name": category}, {"displayName": category}] + }, } }, }, - 'queryType': 'similarTo', - 'returnProperties': ['@rid', 'displayName'], + "queryType": "similarTo", + "returnProperties": ["@rid", "displayName"], } ) if categories: - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, categories, disease_name): - ipr_row['variant'] = category - ipr_row['variantType'] = 'tmb' + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, categories, disease_name + ): + ipr_row["variant"] = category + ipr_row["variantType"] = "tmb" gkb_matches.append(ipr_row) return gkb_matches diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py index 5f4e846..122c047 100644 --- a/pori_python/ipr/connection.py +++ b/pori_python/ipr/connection.py @@ -2,9 +2,10 @@ import json import os +import time import zlib from typing import Dict, List -import time + from .constants import DEFAULT_URL from .util import logger @@ -137,7 +138,9 @@ def set_analyst_comments(self, report_id: str, data: Dict) -> Dict: data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), ) - def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str] = {}) -> None: + def post_images( + self, report_id: str, files: Dict[str, str], data: Dict[str, str] = {} + ) -> None: """ Post images to the report """ @@ -168,7 +171,9 @@ def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str handler.close() start_index += IMAGE_MAX if image_errors: - raise ValueError(f'Error uploading images ({", ".join(sorted(list(image_errors)))})') + raise ValueError( + f'Error uploading images ({", ".join(sorted(list(image_errors)))})' + ) def get_spec(self) -> Dict: """ diff --git a/pori_python/ipr/constants.py b/pori_python/ipr/constants.py index 493d620..948abc9 100644 --- a/pori_python/ipr/constants.py +++ b/pori_python/ipr/constants.py @@ -1,9 +1,17 @@ -DEFAULT_URL = 'https://iprstaging-api.bcgsc.ca/api' -GERMLINE_BASE_TERMS = ('pharmacogenomic', 'cancer predisposition') # based on graphkb.constants -VARIANT_CLASSES = {'Variant', 'CategoryVariant', 'PositionalVariant', 'CatalogueVariant'} +DEFAULT_URL = "https://iprstaging-api.bcgsc.ca/api" +GERMLINE_BASE_TERMS = ( + "pharmacogenomic", + "cancer predisposition", +) # based on graphkb.constants +VARIANT_CLASSES = { + "Variant", + "CategoryVariant", + "PositionalVariant", + "CatalogueVariant", +} # all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial'] -FAILED_REVIEW_STATUS = 'failed' +FAILED_REVIEW_STATUS = "failed" TMB_HIGH = 10.0 # genomic mutations per mb - https://www.bcgsc.ca/jira/browse/GERO-296 -TMB_HIGH_CATEGORY = 'high mutation burden' +TMB_HIGH_CATEGORY = "high mutation burden" diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index c18608e..628428a 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -7,9 +7,10 @@ import os import pandas as pd from Bio.Data.IUPACData import protein_letters_3to1 -from pori_python.graphkb.match import INPUT_COPY_CATEGORIES, INPUT_EXPRESSION_CATEGORIES from typing import Callable, Dict, Iterable, List, Set, Tuple, cast +from pori_python.graphkb.match import INPUT_COPY_CATEGORIES, INPUT_EXPRESSION_CATEGORIES + from .types import ( IprCopyVariant, IprExprVariant, @@ -19,133 +20,133 @@ ) from .util import hash_key, logger, pandas_falsy -protein_letters_3to1.setdefault('Ter', '*') +protein_letters_3to1.setdefault("Ter", "*") -SPECIFICATION = os.path.join(os.path.dirname(__file__), 'content.spec.json') +SPECIFICATION = os.path.join(os.path.dirname(__file__), "content.spec.json") # content in the local specification should match the values in IPR_API_SPEC_JSON_URL -IPR_API_SPEC_JSON_URL = 'https://ipr-api.bcgsc.ca/api/spec.json' +IPR_API_SPEC_JSON_URL = "https://ipr-api.bcgsc.ca/api/spec.json" # TODO: GERO-307 - use SPECIFICATION json to derive the variant required and optional details defined below # 'cnvState' is for display -COPY_REQ = ['gene', 'kbCategory'] -COPY_KEY = ['gene'] +COPY_REQ = ["gene", "kbCategory"] +COPY_KEY = ["gene"] COPY_OPTIONAL = [ - 'cnvState', - 'copyChange', - 'lohState', # Loss of Heterzygosity state - informative detail to analyst - 'chromosomeBand', - 'start', - 'end', - 'size', - 'log2Cna', - 'cna', - 'comments', - 'library', - 'germline', + "cnvState", + "copyChange", + "lohState", # Loss of Heterzygosity state - informative detail to analyst + "chromosomeBand", + "start", + "end", + "size", + "log2Cna", + "cna", + "comments", + "library", + "germline", ] -SMALL_MUT_REQ = ['gene', 'proteinChange'] +SMALL_MUT_REQ = ["gene", "proteinChange"] # alternate details in the key, can distinguish / subtype events. SMALL_MUT_KEY = SMALL_MUT_REQ + [ - 'altSeq', - 'chromosome', - 'endPosition', - 'refSeq', - 'startPosition', - 'transcript', + "altSeq", + "chromosome", + "endPosition", + "refSeq", + "startPosition", + "transcript", ] SMALL_MUT_OPTIONAL = [ - 'altSeq', - 'comments', - 'chromosome', - 'endPosition', - 'germline', - 'hgvsCds', - 'hgvsGenomic', - 'hgvsProtein', - 'library', - 'ncbiBuild', - 'normalAltCount', - 'normalDepth', - 'normalRefCount', - 'refSeq', - 'rnaAltCount', - 'rnaDepth', - 'rnaRefCount', - 'startPosition', - 'transcript', - 'tumourAltCount', - 'tumourAltCopies', - 'tumourDepth', - 'tumourRefCount', - 'tumourRefCopies', - 'zygosity', + "altSeq", + "comments", + "chromosome", + "endPosition", + "germline", + "hgvsCds", + "hgvsGenomic", + "hgvsProtein", + "library", + "ncbiBuild", + "normalAltCount", + "normalDepth", + "normalRefCount", + "refSeq", + "rnaAltCount", + "rnaDepth", + "rnaRefCount", + "startPosition", + "transcript", + "tumourAltCount", + "tumourAltCopies", + "tumourDepth", + "tumourRefCount", + "tumourRefCopies", + "zygosity", ] -EXP_REQ = ['gene', 'kbCategory'] -EXP_KEY = ['gene'] +EXP_REQ = ["gene", "kbCategory"] +EXP_KEY = ["gene"] EXP_OPTIONAL = [ - 'biopsySiteFoldChange', - 'biopsySitePercentile', - 'biopsySiteQC', - 'biopsySiteZScore', - 'biopsySitekIQR', - 'comments', - 'diseaseFoldChange', - 'diseasekIQR', - 'diseasePercentile', - 'diseaseQC', - 'diseaseZScore', - 'expressionState', - 'histogramImage', - 'library', - 'primarySiteFoldChange', - 'primarySitekIQR', - 'primarySitePercentile', - 'primarySiteQC', - 'primarySiteZScore', - 'internalPancancerFoldChange', - 'internalPancancerkIQR', - 'internalPancancerPercentile', - 'internalPancancerQC', - 'internalPancancerZScore', - 'rnaReads', - 'rpkm', - 'tpm', + "biopsySiteFoldChange", + "biopsySitePercentile", + "biopsySiteQC", + "biopsySiteZScore", + "biopsySitekIQR", + "comments", + "diseaseFoldChange", + "diseasekIQR", + "diseasePercentile", + "diseaseQC", + "diseaseZScore", + "expressionState", + "histogramImage", + "library", + "primarySiteFoldChange", + "primarySitekIQR", + "primarySitePercentile", + "primarySiteQC", + "primarySiteZScore", + "internalPancancerFoldChange", + "internalPancancerkIQR", + "internalPancancerPercentile", + "internalPancancerQC", + "internalPancancerZScore", + "rnaReads", + "rpkm", + "tpm", ] SV_REQ = [ - 'eventType', - 'breakpoint', - 'gene1', # prev: nterm_hugo - 'gene2', # prev: cterm_hugo - 'exon1', # n-terminal - 'exon2', # c-terminal + "eventType", + "breakpoint", + "gene1", # prev: nterm_hugo + "gene2", # prev: cterm_hugo + "exon1", # n-terminal + "exon2", # c-terminal ] SV_KEY = SV_REQ[:] SV_OPTIONAL = [ - 'ctermTranscript', - 'ntermTranscript', - 'ctermGene', # combined hugo ensembl form - 'ntermGene', # combined hugo ensembl form - 'detectedIn', - 'conventionalName', - 'svg', - 'svgTitle', - 'name', - 'frame', - 'omicSupport', - 'highQuality', - 'comments', - 'library', - 'rnaAltCount', - 'rnaDepth', - 'tumourAltCount', - 'tumourDepth', - 'germline', - 'mavis_product_id', + "ctermTranscript", + "ntermTranscript", + "ctermGene", # combined hugo ensembl form + "ntermGene", # combined hugo ensembl form + "detectedIn", + "conventionalName", + "svg", + "svgTitle", + "name", + "frame", + "omicSupport", + "highQuality", + "comments", + "library", + "rnaAltCount", + "rnaDepth", + "tumourAltCount", + "tumourDepth", + "germline", + "mavis_product_id", ] @@ -170,7 +171,7 @@ def validate_variant_rows( Returns: the rows from the tab file as dictionaries """ - header = required + optional + ['key'] + header = required + optional + ["key"] result = [] keys = set() @@ -181,18 +182,18 @@ def validate_variant_rows( if not header_validated: for req_col in required: if req_col not in row: - raise ValueError(f'header missing required column ({req_col})') + raise ValueError(f"header missing required column ({req_col})") header_validated = True row_key = hash_key(row_to_key(row)) if row_key in keys: - raise ValueError(f'duplicate row key ({row_key}) from ({row_to_key(row)})') - row['key'] = row_key + raise ValueError(f"duplicate row key ({row_key}) from ({row_to_key(row)})") + row["key"] = row_key keys.add(row_key) for k, v in row.items(): if v is pd.NA: - row[k] = '' + row[k] = "" - result.append(cast(IprVariant, {col: row.get(col, '') for col in header})) + result.append(cast(IprVariant, {col: row.get(col, "") for col in header})) return result @@ -212,20 +213,20 @@ def preprocess_copy_variants(rows: Iterable[Dict]) -> List[IprCopyVariant]: display_name_mapping.update(dict([(v, v) for v in display_name_mapping.values()])) def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(['cnv'] + [row[key] for key in COPY_KEY]) + return tuple(["cnv"] + [row[key] for key in COPY_KEY]) result = validate_variant_rows(rows, COPY_REQ, COPY_OPTIONAL, row_key) ret_list = [cast(IprCopyVariant, var) for var in result] for row in ret_list: - kb_cat = row.get('kbCategory') - kb_cat = '' if pd.isnull(kb_cat) else str(kb_cat) + kb_cat = row.get("kbCategory") + kb_cat = "" if pd.isnull(kb_cat) else str(kb_cat) if kb_cat: if kb_cat not in INPUT_COPY_CATEGORIES.values(): - raise ValueError(f'invalid copy variant kbCategory value ({kb_cat})') - if not row.get('cnvState'): # apply default short display name - row['cnvState'] = display_name_mapping[kb_cat] - row['variant'] = kb_cat - row['variantType'] = 'cnv' + raise ValueError(f"invalid copy variant kbCategory value ({kb_cat})") + if not row.get("cnvState"): # apply default short display name + row["cnvState"] = display_name_mapping[kb_cat] + row["variant"] = kb_cat + row["variantType"] = "cnv" return ret_list @@ -238,28 +239,28 @@ def preprocess_small_mutations(rows: Iterable[Dict]) -> List[IprSmallMutationVar def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]: key_vals = [] - for kval in [row.get(key, '') for key in SMALL_MUT_KEY]: - key_vals.append(str(kval) if pd.notnull(kval) else '') - return tuple(['small mutation'] + key_vals) + for kval in [row.get(key, "") for key in SMALL_MUT_KEY]: + key_vals.append(str(kval) if pd.notnull(kval) else "") + return tuple(["small mutation"] + key_vals) result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key) if not result: return [] def pick_variant(row: IprSmallMutationVariant) -> str: - protein_change = row.get('proteinChange') + protein_change = row.get("proteinChange") if not pandas_falsy(protein_change): for longAA, shortAA in protein_letters_3to1.items(): protein_change = str(protein_change).replace(longAA, shortAA) - hgvsp = '{}:{}'.format(row['gene'], protein_change) + hgvsp = "{}:{}".format(row["gene"], protein_change) return hgvsp - for field in ['hgvsProtein', 'hgvsCds', 'hgvsGenomic']: + for field in ["hgvsProtein", "hgvsCds", "hgvsGenomic"]: if not pandas_falsy(row.get(field)): return str(row.get(field)) raise ValueError( - 'Variant field cannot be empty. Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string' + "Variant field cannot be empty. Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string" ) # 'location' and 'refAlt' are not currently used for matching; still optional and allowed blank @@ -268,21 +269,21 @@ def pick_variant(row: IprSmallMutationVariant) -> str: # for row in result: def convert_sm(row: IprVariant) -> IprSmallMutationVariant: ret = cast(IprSmallMutationVariant, row) - ret['variant'] = pick_variant(ret) - ret['variantType'] = 'mut' + ret["variant"] = pick_variant(ret) + ret["variantType"] = "mut" - if ret.get('startPosition') and not ret.get('endPosition'): - ret['endPosition'] = ret['startPosition'] + if ret.get("startPosition") and not ret.get("endPosition"): + ret["endPosition"] = ret["startPosition"] # default depth to alt + ref if not given - for sample_type in ('normal', 'rna', 'tumour'): + for sample_type in ("normal", "rna", "tumour"): if ( - ret.get(f'{sample_type}RefCount') - and ret.get(f'{sample_type}AltCount') - and not ret.get(f'{sample_type}Depth') + ret.get(f"{sample_type}RefCount") + and ret.get(f"{sample_type}AltCount") + and not ret.get(f"{sample_type}Depth") ): - ret[f'{sample_type}Depth'] = ( # type: ignore - ret[f'{sample_type}RefCount'] + ret[f'{sample_type}AltCount'] # type: ignore + ret[f"{sample_type}Depth"] = ( # type: ignore + ret[f"{sample_type}RefCount"] + ret[f"{sample_type}AltCount"] # type: ignore ) return ret @@ -298,65 +299,65 @@ def preprocess_expression_variants(rows: Iterable[Dict]) -> List[IprExprVariant] """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(['expression'] + [row[key] for key in EXP_KEY]) + return tuple(["expression"] + [row[key] for key in EXP_KEY]) variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key) result = [cast(IprExprVariant, var) for var in variants] float_columns = [ col for col in EXP_REQ + EXP_OPTIONAL - if col.endswith('kIQR') - or col.endswith('Percentile') - or col.endswith('FoldChange') - or col.endswith('QC') - or col.endswith('ZScore') - or col in ['tpm', 'rpkm'] + if col.endswith("kIQR") + or col.endswith("Percentile") + or col.endswith("FoldChange") + or col.endswith("QC") + or col.endswith("ZScore") + or col in ["tpm", "rpkm"] ] errors = [] for row in result: - row['variant'] = row['kbCategory'] - if not row['expressionState'] and row['kbCategory']: - row['expressionState'] = row['kbCategory'] + row["variant"] = row["kbCategory"] + if not row["expressionState"] and row["kbCategory"]: + row["expressionState"] = row["kbCategory"] - if row['variant'] and not pd.isnull(row['variant']): - if row['variant'] not in INPUT_EXPRESSION_CATEGORIES.values(): + if row["variant"] and not pd.isnull(row["variant"]): + if row["variant"] not in INPUT_EXPRESSION_CATEGORIES.values(): err_msg = f"{row['gene']} variant '{row['variant']}' not in {INPUT_EXPRESSION_CATEGORIES.values()}" errors.append(err_msg) logger.error(err_msg) - row['variantType'] = 'exp' + row["variantType"] = "exp" for col in float_columns: - if row[col] in ['inf', '+inf', '-inf']: - row[col] = row[col].replace('inf', 'Infinity') + if row[col] in ["inf", "+inf", "-inf"]: + row[col] = row[col].replace("inf", "Infinity") # check images exist - if row['histogramImage'] and not os.path.exists(row['histogramImage']): + if row["histogramImage"] and not os.path.exists(row["histogramImage"]): raise FileNotFoundError(f'missing image ({row["histogramImage"]})') if errors: - raise ValueError(f'{len(errors)} Invalid expression variants in file') + raise ValueError(f"{len(errors)} Invalid expression variants in file") return result def create_graphkb_sv_notation(row: IprFusionVariant) -> str: """Generate GKB/IPR fusion style notation from a structural variant.""" - gene1 = row['gene1'] or '?' - gene2 = row['gene2'] or '?' - exon1 = str(row['exon1']) if row['exon1'] else '?' - exon2 = str(row['exon2']) if row['exon2'] else '?' - if not row['gene1']: + gene1 = row["gene1"] or "?" + gene2 = row["gene2"] or "?" + exon1 = str(row["exon1"]) if row["exon1"] else "?" + exon2 = str(row["exon2"]) if row["exon2"] else "?" + if not row["gene1"]: gene1, gene2 = gene2, gene1 exon1, exon2 = exon2, exon1 - if gene1 == '?': + if gene1 == "?": raise ValueError( f'both genes cannot be blank for a structural variant {row["key"]}. At least 1 gene must be entered' ) # force exons to integer repr string exon1 = exon1[:-2] if exon1.endswith(".0") else exon1 exon2 = exon2[:-2] if exon2.endswith(".0") else exon2 - return f'({gene1},{gene2}):fusion(e.{exon1},e.{exon2})' + return f"({gene1},{gene2}):fusion(e.{exon1},e.{exon2})" def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVariant]: @@ -366,21 +367,21 @@ def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVarian """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(['sv'] + [row[key] for key in SV_KEY]) + return tuple(["sv"] + [row[key] for key in SV_KEY]) variants = validate_variant_rows(rows, SV_REQ, SV_OPTIONAL, row_key) result = [cast(IprFusionVariant, var) for var in variants] # genes are optional for structural variants for row in result: - row['variant'] = create_graphkb_sv_notation(row) - row['variantType'] = 'sv' + row["variant"] = create_graphkb_sv_notation(row) + row["variantType"] = "sv" # check and load the svg file where applicable - if row['svg'] and not pd.isnull(row['svg']): - if not os.path.exists(row['svg']): - raise FileNotFoundError(row['svg']) - with open(row['svg'], 'r') as fh: - row['svg'] = fh.read() + if row["svg"] and not pd.isnull(row["svg"]): + if not os.path.exists(row["svg"]): + raise FileNotFoundError(row["svg"]) + with open(row["svg"], "r") as fh: + row["svg"] = fh.read() return result @@ -408,39 +409,41 @@ def check_variant_links( missing_information_genes = set() missing_information_errors = set() - copy_variant_genes = {variant['gene'] for variant in copy_variants} - expression_variant_genes = {variant['gene'] for variant in expression_variants} + copy_variant_genes = {variant["gene"] for variant in copy_variants} + expression_variant_genes = {variant["gene"] for variant in expression_variants} genes_with_variants = set() # filter excess copy variants variant = IprVariant # to silence type errors for variant in copy_variants: - gene = variant['gene'] + gene = variant["gene"] if not gene: logger.error("copy_variant data cannot be applied to an empty genename") - elif variant['variant']: + elif variant["variant"]: genes_with_variants.add(gene) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a copy variant but is missing expression information' + f"gene ({gene}) has a copy variant but is missing expression information" ) for variant in expression_variants: - gene = variant['gene'] + gene = variant["gene"] if not gene: - logger.error("expression_variant data cannot be applied to an empty genename") - elif variant['variant']: + logger.error( + "expression_variant data cannot be applied to an empty genename" + ) + elif variant["variant"]: genes_with_variants.add(gene) if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has an expression variant but is missing copy number information' + f"gene ({gene}) has an expression variant but is missing copy number information" ) for variant in small_mutations: - gene = variant['gene'] + gene = variant["gene"] if not gene: logger.error("small_mutation data cannot be applied to an empty genename") continue @@ -448,104 +451,104 @@ def check_variant_links( if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a small mutation but is missing copy number information' + f"gene ({gene}) has a small mutation but is missing copy number information" ) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a small mutation but is missing expression information' + f"gene ({gene}) has a small mutation but is missing expression information" ) genes_with_variants.add(gene) for variant in structural_variants: - for gene in [variant['gene1'], variant['gene2']]: + for gene in [variant["gene1"], variant["gene2"]]: if gene: # genes are optional for structural variants if gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a structural variant but is missing copy number information' + f"gene ({gene}) has a structural variant but is missing copy number information" ) if gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a structural variant but is missing expression information' + f"gene ({gene}) has a structural variant but is missing expression information" ) genes_with_variants.add(gene) if missing_information_genes: for err_msg in sorted(missing_information_errors): logger.debug(err_msg) - link_err_msg = ( - f'Missing information variant links on {len(missing_information_genes)} genes' - ) + link_err_msg = f"Missing information variant links on {len(missing_information_genes)} genes" logger.warning(link_err_msg) return genes_with_variants -def check_comparators(content: Dict, expresssionVariants: List[IprExprVariant] = []) -> None: +def check_comparators( + content: Dict, expresssionVariants: List[IprExprVariant] = [] +) -> None: """ Given the optional content dictionary, check that based on the analyses present the correct/sufficient comparators have also been specified """ - mutation_burden = 'mutationBurden' - comparator_roles = {c['analysisRole'] for c in content.get('comparators', [])} + mutation_burden = "mutationBurden" + comparator_roles = {c["analysisRole"] for c in content.get("comparators", [])} - for image in content.get('images', []): - key = image['key'] + for image in content.get("images", []): + key = image["key"] if key.startswith(mutation_burden): - comp_type = key.split('.')[-1] - role = f'mutation burden ({comp_type})' + comp_type = key.split(".")[-1] + role = f"mutation burden ({comp_type})" if role in comparator_roles: continue - if '_sv.' in key: - sv_role = f'mutation burden SV ({comp_type})' + if "_sv." in key: + sv_role = f"mutation burden SV ({comp_type})" if sv_role in comparator_roles: continue - raise ValueError(f'missing required comparator definition ({role})') + raise ValueError(f"missing required comparator definition ({role})") if expresssionVariants: - required_comparators = {'expression (disease)'} + required_comparators = {"expression (disease)"} def all_none(row: IprExprVariant, columns: List[str]) -> bool: - return all([row.get(col) is None or row.get(col) == '' for col in columns]) + return all([row.get(col) is None or row.get(col) == "" for col in columns]) for exp in expresssionVariants: if not all_none( exp, [ - 'primarySitekIQR', - 'primarySitePercentile', - 'primarySiteZScore', - 'primarySiteFoldChange', + "primarySitekIQR", + "primarySitePercentile", + "primarySiteZScore", + "primarySiteFoldChange", ], ): - required_comparators.add('expression (primary site)') + required_comparators.add("expression (primary site)") if not all_none( exp, [ - 'biopsySitekIQR', - 'biopsySitePercentile', - 'biopsySiteZScore', - 'biopsySiteFoldChange', + "biopsySitekIQR", + "biopsySitePercentile", + "biopsySiteZScore", + "biopsySiteFoldChange", ], ): - required_comparators.add('expression (biopsy site)') + required_comparators.add("expression (biopsy site)") if not all_none( exp, [ - 'internalPancancerkIQR', - 'internalPancancerPercentile', - 'internalPancancerZScore', - 'internalPancancerFoldChange', + "internalPancancerkIQR", + "internalPancancerPercentile", + "internalPancancerZScore", + "internalPancancerFoldChange", ], ): - required_comparators.add('expression (internal pancancer cohort)') + required_comparators.add("expression (internal pancancer cohort)") if required_comparators - comparator_roles: - missing = '; '.join(sorted(list(required_comparators - comparator_roles))) - raise ValueError(f'missing required comparator definitions ({missing})') + missing = "; ".join(sorted(list(required_comparators - comparator_roles))) + raise ValueError(f"missing required comparator definitions ({missing})") def extend_with_default(validator_class): @@ -570,7 +573,9 @@ def check_null(checker, instance): type_checker = validator_class.TYPE_CHECKER.redefine("null", check_null) return jsonschema.validators.extend( - validator_class, validators={"properties": set_defaults}, type_checker=type_checker + validator_class, + validators={"properties": set_defaults}, + type_checker=type_checker, ) @@ -584,7 +589,7 @@ def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> Adds defaults as reccommended by: https://python-jsonschema.readthedocs.io/en/latest/faq/#why-doesn-t-my-schema-s-default-property-set-the-default-on-my-instance """ - with open(schema_file, 'r') as fh: + with open(schema_file, "r") as fh: schema = json.load(fh) return DefaultValidatingDraft7Validator(schema).validate(content) diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py index 6de20d2..7361042 100644 --- a/pori_python/ipr/ipr.py +++ b/pori_python/ipr/ipr.py @@ -3,24 +3,32 @@ by other reporting systems """ +from typing import Dict, Iterable, List, Sequence, Set, Tuple + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb import statement as gkb_statement from pori_python.graphkb import vocab as gkb_vocab -from typing import Dict, Iterable, List, Sequence, Set, Tuple from .constants import GERMLINE_BASE_TERMS, VARIANT_CLASSES -from .types import GkbStatement, ImageDefinition, IprFusionVariant, IprGene, IprVariant, KbMatch +from .types import ( + GkbStatement, + ImageDefinition, + IprFusionVariant, + IprGene, + IprVariant, + KbMatch, +) from .util import find_variant, logger def display_evidence_levels(statement: GkbStatement) -> str: result = [] - for evidence_level in statement.get('evidenceLevel', []) or []: + for evidence_level in statement.get("evidenceLevel", []) or []: if isinstance(evidence_level, str): result.append(evidence_level) - elif 'displayName' in evidence_level: - result.append(evidence_level['displayName']) - return ';'.join(sorted(result)) + elif "displayName" in evidence_level: + result.append(evidence_level["displayName"]) + return ";".join(sorted(result)) def filter_structural_variants( @@ -32,9 +40,13 @@ def filter_structural_variants( Filter structural variants to remove non-high quality events unless they are matched/annotated or they involve a gene that is a known fusion partner """ - matched_svs = {match['variant'] for match in kb_matches if match['variantType'] == 'sv'} + matched_svs = { + match["variant"] for match in kb_matches if match["variantType"] == "sv" + } fusion_genes = { - gene['name'] for gene in gene_annotations if gene.get('knownFusionPartner', False) + gene["name"] + for gene in gene_annotations + if gene.get("knownFusionPartner", False) } result = [] @@ -42,10 +54,10 @@ def filter_structural_variants( for structural_variant in structural_variants: if any( [ - structural_variant['highQuality'], - structural_variant['key'] in matched_svs, - structural_variant['gene1'] in fusion_genes, - structural_variant['gene2'] in fusion_genes, + structural_variant["highQuality"], + structural_variant["key"] in matched_svs, + structural_variant["gene1"] in fusion_genes, + structural_variant["gene2"] in fusion_genes, ] ): result.append(structural_variant) @@ -72,11 +84,15 @@ def get_evidencelevel_mapping(graphkb_conn: GraphKBConnection) -> Dict[str, str] # Filter IPR EvidenceLevel and map each outgoing CrossReferenceOf to displayName ipr_source_rid = graphkb_conn.get_source("ipr")["@rid"] - ipr_evidence_levels = filter(lambda d: d.get("source") == ipr_source_rid, evidence_levels) + ipr_evidence_levels = filter( + lambda d: d.get("source") == ipr_source_rid, evidence_levels + ) cross_references_mapping: Dict[str, str] = dict() ipr_rids_to_displayname = dict() for level in ipr_evidence_levels: - d = map(lambda i: (i, level["displayName"]), level.get("out_CrossReferenceOf", [])) + d = map( + lambda i: (i, level["displayName"]), level.get("out_CrossReferenceOf", []) + ) cross_references_mapping.update(d) ipr_rids_to_displayname[level["@rid"]] = level["displayName"] @@ -119,21 +135,25 @@ def convert_statements_to_alterations( - only report disease matched prognostic markers https://www.bcgsc.ca/jira/browse/GERO-72 and GERO-196 """ disease_matches = { - r['@rid'] - for r in gkb_vocab.get_term_tree(graphkb_conn, disease_name, ontology_class='Disease') + r["@rid"] + for r in gkb_vocab.get_term_tree( + graphkb_conn, disease_name, ontology_class="Disease" + ) } if not disease_matches: - raise ValueError(f'failed to match disease ({disease_name}) to graphkb') + raise ValueError(f"failed to match disease ({disease_name}) to graphkb") rows = [] ev_map = get_evidencelevel_mapping(graphkb_conn) # GERO-318 - add all IPR-A evidence equivalents to the approvedTherapy flag - approved = set([ev for (ev, ipr) in ev_map.items() if ipr == 'IPR-A']) + approved = set([ev for (ev, ipr) in ev_map.items() if ipr == "IPR-A"]) # get the recruitment status for any trial associated with a statement clinical_trials = [ - s['subject']['@rid'] for s in statements if s['subject']['@class'] == 'ClinicalTrial' + s["subject"]["@rid"] + for s in statements + if s["subject"]["@class"] == "ClinicalTrial" ] recruitment_statuses = {} if clinical_trials: @@ -141,71 +161,81 @@ def convert_statements_to_alterations( for rid in clinical_trials: query_result = graphkb_conn.query( { - 'target': {'target': 'ClinicalTrial', 'filters': {'@rid': rid}}, - 'returnProperties': ['@rid', 'recruitmentStatus'], + "target": {"target": "ClinicalTrial", "filters": {"@rid": rid}}, + "returnProperties": ["@rid", "recruitmentStatus"], } ) if query_result: - recruitment_statuses[rid] = query_result[0]['recruitmentStatus'] + recruitment_statuses[rid] = query_result[0]["recruitmentStatus"] for statement in statements: - variants = [c for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES] - diseases = [c for c in statement['conditions'] if c['@class'] == 'Disease'] - disease_match = len(diseases) == 1 and diseases[0]['@rid'] in disease_matches - pmid = ';'.join([e['displayName'] for e in statement['evidence']]) + variants = [ + c for c in statement["conditions"] if c["@class"] in VARIANT_CLASSES + ] + diseases = [c for c in statement["conditions"] if c["@class"] == "Disease"] + disease_match = len(diseases) == 1 and diseases[0]["@rid"] in disease_matches + pmid = ";".join([e["displayName"] for e in statement["evidence"]]) ipr_section = gkb_statement.categorize_relevance( - graphkb_conn, statement['relevance']['@rid'] + graphkb_conn, statement["relevance"]["@rid"] ) approved_therapy = False - if ipr_section == 'therapeutic': - for level in statement['evidenceLevel'] or []: - if level['@rid'] in approved: + if ipr_section == "therapeutic": + for level in statement["evidenceLevel"] or []: + if level["@rid"] in approved: approved_therapy = True break - if ipr_section == 'prognostic' and not disease_match: + if ipr_section == "prognostic" and not disease_match: continue # GERO-72 / GERO-196 evidence_level_str = display_evidence_levels(statement) - evidence_levels = statement.get('evidenceLevel') or [] - ipr_evidence_levels = [ev_map[el.get('@rid', '')] for el in evidence_levels if el] - ipr_evidence_levels_str = ';'.join(sorted(set([el for el in ipr_evidence_levels]))) + evidence_levels = statement.get("evidenceLevel") or [] + ipr_evidence_levels = [ + ev_map[el.get("@rid", "")] for el in evidence_levels if el + ] + ipr_evidence_levels_str = ";".join( + sorted(set([el for el in ipr_evidence_levels])) + ) for variant in variants: - if variant['@rid'] not in variant_matches: + if variant["@rid"] not in variant_matches: continue row = KbMatch( { - 'approvedTherapy': approved_therapy, - 'category': ipr_section or 'unknown', - 'context': ( - statement['subject']['displayName'] if statement['subject'] else None + "approvedTherapy": approved_therapy, + "category": ipr_section or "unknown", + "context": ( + statement["subject"]["displayName"] + if statement["subject"] + else None ), - 'kbContextId': (statement['subject']['@rid'] if statement['subject'] else None), - 'disease': ';'.join(sorted(d['displayName'] for d in diseases)), - 'evidenceLevel': evidence_level_str, - 'iprEvidenceLevel': ipr_evidence_levels_str, - 'kbStatementId': statement['@rid'], - 'kbVariant': variant['displayName'], - 'kbVariantId': variant['@rid'], - 'matchedCancer': disease_match, - 'reference': pmid, - 'relevance': statement['relevance']['displayName'], - 'kbRelevanceId': statement['relevance']['@rid'], - 'externalSource': ( - str(statement['source'].get('displayName', '')) - if statement['source'] + "kbContextId": ( + statement["subject"]["@rid"] if statement["subject"] else None + ), + "disease": ";".join(sorted(d["displayName"] for d in diseases)), + "evidenceLevel": evidence_level_str, + "iprEvidenceLevel": ipr_evidence_levels_str, + "kbStatementId": statement["@rid"], + "kbVariant": variant["displayName"], + "kbVariantId": variant["@rid"], + "matchedCancer": disease_match, + "reference": pmid, + "relevance": statement["relevance"]["displayName"], + "kbRelevanceId": statement["relevance"]["@rid"], + "externalSource": ( + str(statement["source"].get("displayName", "")) + if statement["source"] else None ), - 'externalStatementId': statement.get('sourceId'), - 'reviewStatus': statement.get('reviewStatus'), - 'kbData': {}, + "externalStatementId": statement.get("sourceId"), + "reviewStatus": statement.get("reviewStatus"), + "kbData": {}, } ) - if statement['relevance']['name'] == 'eligibility': - row['kbData']['recruitment_status'] = recruitment_statuses.get( - row['kbContextId'], 'not found' + if statement["relevance"]["name"] == "eligibility": + row["kbData"]["recruitment_status"] = recruitment_statuses.get( + row["kbContextId"], "not found" ) rows.append(row) return rows @@ -228,22 +258,24 @@ def select_expression_plots( """ selected_variants = { - (match['variantType'], match['variant']) + (match["variantType"], match["variant"]) for match in kb_matches - if match['category'] == 'therapeutic' + if match["category"] == "therapeutic" } images_by_gene: Dict[str, ImageDefinition] = {} selected_genes = set() for variant in all_variants: - if (variant['variantType'], variant['key']) in selected_variants: - for key in ['gene', 'gene1', 'gene2']: + if (variant["variantType"], variant["key"]) in selected_variants: + for key in ["gene", "gene1", "gene2"]: gene = variant.get(key) if gene: selected_genes.add(str(gene)) - gene = str(variant.get('gene', '')) - hist = str(variant.get('histogramImage', '')) + gene = str(variant.get("gene", "")) + hist = str(variant.get("histogramImage", "")) if hist: - images_by_gene[gene] = ImageDefinition({'key': f'expDensity.{gene}', 'path': hist}) + images_by_gene[gene] = ImageDefinition( + {"key": f"expDensity.{gene}", "path": hist} + ) return [images_by_gene[gene] for gene in selected_genes if gene in images_by_gene] @@ -256,17 +288,17 @@ def create_key_alterations( """ alterations = [] type_mapping = { - 'mut': 'smallMutations', - 'cnv': 'CNVs', - 'sv': 'SVs', - 'exp': 'expressionOutliers', + "mut": "smallMutations", + "cnv": "CNVs", + "sv": "SVs", + "exp": "expressionOutliers", } counts: Dict[str, Set] = {v: set() for v in type_mapping.values()} skipped_variant_types = [] for kb_match in kb_matches: - variant_type = kb_match['variantType'] - variant_key = kb_match['variant'] - if kb_match['category'] == 'unknown': + variant_type = kb_match["variantType"] + variant_key = kb_match["variant"] + if kb_match["category"] == "unknown": continue if variant_type not in type_mapping.keys(): @@ -285,32 +317,36 @@ def create_key_alterations( counts[type_mapping[variant_type]].add(variant_key) - if variant_type == 'exp': - alterations.append(f'{variant.get("gene","")} ({variant.get("expressionState")})') - elif variant_type == 'cnv': + if variant_type == "exp": + alterations.append( + f'{variant.get("gene","")} ({variant.get("expressionState")})' + ) + elif variant_type == "cnv": alterations.append(f'{variant.get("gene","")} ({variant.get("cnvState")})') # only show germline if relevant - elif kb_match['category'] in GERMLINE_BASE_TERMS and variant.get('germline'): + elif kb_match["category"] in GERMLINE_BASE_TERMS and variant.get("germline"): alterations.append(f"germline {variant['variant']}") else: - alterations.append(variant['variant']) + alterations.append(variant["variant"]) counted_variants = set.union(*counts.values()) - counts['variantsUnknown'] = set() + counts["variantsUnknown"] = set() # count the un-matched variants for variant in all_variants: - if variant['variant'] and variant['key'] not in counted_variants: - counts['variantsUnknown'].add(variant['key']) + if variant["variant"] and variant["key"] not in counted_variants: + counts["variantsUnknown"].add(variant["key"]) return ( - [{'geneVariant': alt} for alt in set(alterations)], + [{"geneVariant": alt} for alt in set(alterations)], {k: len(v) for k, v in counts.items()}, ) def germline_kb_matches( - kb_matches: List[KbMatch], all_variants: Sequence[IprVariant], assume_somatic: bool = True + kb_matches: List[KbMatch], + all_variants: Sequence[IprVariant], + assume_somatic: bool = True, ) -> List[KbMatch]: """Filter kb_matches for matching to germline or somatic events using the 'germline' optional property. @@ -327,14 +363,14 @@ def germline_kb_matches( filtered list of kb_matches """ ret_list = [] - germ_alts = [alt for alt in kb_matches if alt['category'] in GERMLINE_BASE_TERMS] + germ_alts = [alt for alt in kb_matches if alt["category"] in GERMLINE_BASE_TERMS] somatic_alts = [alt for alt in kb_matches if alt not in germ_alts] if germ_alts: logger.info(f"checking germline status of {GERMLINE_BASE_TERMS}") for alt in germ_alts: - var_list = [v for v in all_variants if v['key'] == alt['variant']] - germline_var_list = [v for v in var_list if v.get('germline')] - unknown_var_list = [v for v in var_list if 'germline' not in v] + var_list = [v for v in all_variants if v["key"] == alt["variant"]] + germline_var_list = [v for v in var_list if v.get("germline")] + unknown_var_list = [v for v in var_list if "germline" not in v] if germline_var_list: logger.debug( f"germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" @@ -360,8 +396,10 @@ def germline_kb_matches( if somatic_alts: # Remove any matches to germline events for alt in somatic_alts: - var_list = [v for v in all_variants if v['key'] == alt['variant']] - somatic_var_list = [v for v in var_list if not v.get('germline', not assume_somatic)] + var_list = [v for v in all_variants if v["key"] == alt["variant"]] + somatic_var_list = [ + v for v in var_list if not v.get("germline", not assume_somatic) + ] if var_list and not somatic_var_list: logger.debug( f"Dropping germline match to somatic statement kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" @@ -369,6 +407,8 @@ def germline_kb_matches( elif somatic_var_list: ret_list.append(alt) # match to somatic variant else: - ret_list.append(alt) # alteration not in any specific keys matches to check. + ret_list.append( + alt + ) # alteration not in any specific keys matches to check. return ret_list diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py index 2c6eefd..8c08f24 100644 --- a/pori_python/ipr/main.py +++ b/pori_python/ipr/main.py @@ -5,9 +5,10 @@ import logging import os from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from typing import Dict, List, Sequence + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb.genes import get_gene_information -from typing import Dict, List, Sequence from .annotate import ( annotate_copy_variants, @@ -41,59 +42,68 @@ CACHE_GENE_MINIMUM = 5000 RENAMED_GENE_PROPERTIES = { # old_name: new_name - 'cancerRelated': 'kbStatementRelated', - 'cancerGene': 'cancerGeneListMatch', + "cancerRelated": "kbStatementRelated", + "cancerGene": "cancerGeneListMatch", } def file_path(path: str) -> str: if not os.path.exists(path): - raise argparse.ArgumentTypeError(f'{repr(path)} is not a valid filename. does not exist') + raise argparse.ArgumentTypeError( + f"{repr(path)} is not a valid filename. does not exist" + ) return path def timestamp() -> str: - return datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + return datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") def command_interface() -> None: parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) - req = parser.add_argument_group('required arguments') - (req if not os.environ.get('USER') else parser).add_argument( - '--username', - required=not os.environ.get('USER'), - default=os.environ.get('USER'), - help='username to use connecting to graphkb/ipr', + req = parser.add_argument_group("required arguments") + (req if not os.environ.get("USER") else parser).add_argument( + "--username", + required=not os.environ.get("USER"), + default=os.environ.get("USER"), + help="username to use connecting to graphkb/ipr", + ) + req.add_argument( + "--password", required=True, help="password to use connecting to graphkb/ipr" ) - req.add_argument('--password', required=True, help='password to use connecting to graphkb/ipr') req.add_argument( - '-c', '--content', required=True, type=file_path, help="Report Content as JSON" + "-c", "--content", required=True, type=file_path, help="Report Content as JSON" ) - parser.add_argument('--ipr_url', default=DEFAULT_URL) - parser.add_argument('--graphkb_url', default=None) - parser.add_argument('--log_level', default='info', choices=LOG_LEVELS.keys()) + parser.add_argument("--ipr_url", default=DEFAULT_URL) + parser.add_argument("--graphkb_url", default=None) + parser.add_argument("--log_level", default="info", choices=LOG_LEVELS.keys()) parser.add_argument( - '--therapeutics', default=False, help='Generate therapeutic options', action='store_true' + "--therapeutics", + default=False, + help="Generate therapeutic options", + action="store_true", ) parser.add_argument( - '--skip_comments', + "--skip_comments", default=False, - action='store_true', - help='Turn off generating the analyst comments section of the report', + action="store_true", + help="Turn off generating the analyst comments section of the report", ) parser.add_argument( - '-o', '--output_json_path', help='path to a JSON to output the report upload body' + "-o", + "--output_json_path", + help="path to a JSON to output the report upload body", ) parser.add_argument( - '-w', - '--always_write_output_json', + "-w", + "--always_write_output_json", action="store_true", - help='Write to output_json_path on successful IPR uploads instead of just when the upload fails', + help="Write to output_json_path on successful IPR uploads instead of just when the upload fails", ) args = parser.parse_args() - with open(args.content, 'r') as fh: + with open(args.content, "r") as fh: content = json.load(fh) create_report( @@ -118,12 +128,14 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict """ if ( ipr_spec - and 'components' in ipr_spec.keys() - and 'schemas' in ipr_spec['components'].keys() - and 'genesCreate' in ipr_spec['components']['schemas'].keys() - and 'properties' in ipr_spec['components']['schemas']['genesCreate'].keys() + and "components" in ipr_spec.keys() + and "schemas" in ipr_spec["components"].keys() + and "genesCreate" in ipr_spec["components"]["schemas"].keys() + and "properties" in ipr_spec["components"]["schemas"]["genesCreate"].keys() ): - genes_spec = ipr_spec['components']['schemas']['genesCreate']['properties'].keys() + genes_spec = ipr_spec["components"]["schemas"]["genesCreate"][ + "properties" + ].keys() # check what ipr report upload expects and adjust contents to match for old_name, new_name in RENAMED_GENE_PROPERTIES.items(): @@ -131,13 +143,13 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict logger.warning( f"Legacy IPR - Renaming property {new_name} to {old_name} for compatibility to ipr_spec" ) - for gene in upload_content['genes']: + for gene in upload_content["genes"]: if new_name in gene: gene[old_name] = gene[new_name] gene.pop(new_name) else: outdate_properties = 0 - for gene in upload_content['genes']: + for gene in upload_content["genes"]: if old_name in gene: gene[new_name] = gene[old_name] gene.pop(old_name) @@ -149,7 +161,7 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict # remove any unhandled incompatible keys removed_keys: Dict[str, int] = {} - for gene in upload_content['genes']: + for gene in upload_content["genes"]: unsupported_keys = [key for key in gene.keys() if key not in genes_spec] for key in unsupported_keys: if key in removed_keys: @@ -158,25 +170,29 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict removed_keys[key] = 1 gene.pop(key) for key, count in removed_keys.items(): - logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.") + logger.warning( + f"IPR unsupported property '{key}' removed from {count} genes." + ) - drop_columns = ['variant', 'variantType', 'histogramImage'] + drop_columns = ["variant", "variantType", "histogramImage"] # DEVSU-2034 - use a 'displayName' VARIANT_LIST_KEYS = [ - 'expressionVariants', - 'smallMutations', - 'copyVariants', - 'structuralVariants', - 'probeResults', - 'msi', + "expressionVariants", + "smallMutations", + "copyVariants", + "structuralVariants", + "probeResults", + "msi", ] for variant_list_section in VARIANT_LIST_KEYS: for variant in upload_content.get(variant_list_section, []): - if not variant.get('displayName'): - variant['displayName'] = ( - variant.get('variant') or variant.get('kbCategory') or variant.get('key', '') + if not variant.get("displayName"): + variant["displayName"] = ( + variant.get("variant") + or variant.get("kbCategory") + or variant.get("key", "") ) - if variant_list_section == 'probeResults': + if variant_list_section == "probeResults": # currently probeResults will error if they do NOT have a 'variant' column. # smallMutations will error if they DO have a 'variant' column. continue @@ -184,20 +200,22 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict if col in variant: del variant[col] # tmburMutationBurden is a single value, not list - if upload_content.get('tmburMutationBurden'): - if not upload_content['tmburMutationBurden'].get('displayName'): - upload_content['tmburMutationBurden']['displayName'] = upload_content[ - 'tmburMutationBurden' - ].get('kbCategory', '') - - for row in upload_content['kbMatches']: - del row['kbContextId'] - del row['kbRelevanceId'] + if upload_content.get("tmburMutationBurden"): + if not upload_content["tmburMutationBurden"].get("displayName"): + upload_content["tmburMutationBurden"]["displayName"] = upload_content[ + "tmburMutationBurden" + ].get("kbCategory", "") + + for row in upload_content["kbMatches"]: + del row["kbContextId"] + del row["kbRelevanceId"] return upload_content def create_report(**kwargs) -> Dict: - logger.warning("Deprecated function 'create_report' called - use ipr_report instead") + logger.warning( + "Deprecated function 'create_report' called - use ipr_report instead" + ) return ipr_report(**kwargs) @@ -206,12 +224,12 @@ def ipr_report( password: str, content: Dict, ipr_url: str = DEFAULT_URL, - log_level: str = 'info', - output_json_path: str = '', + log_level: str = "info", + output_json_path: str = "", always_write_output_json: bool = False, ipr_upload: bool = True, interactive: bool = False, - graphkb_url: str = '', + graphkb_url: str = "", generate_therapeutics: bool = False, generate_comments: bool = True, match_germline: bool = False, @@ -245,23 +263,29 @@ def ipr_report( # set the default logging configuration logging.basicConfig( level=LOG_LEVELS[log_level], - format='%(asctime)s %(name)s %(levelname)s %(message)s', - datefmt='%m-%d-%y %H:%M:%S', + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt="%m-%d-%y %H:%M:%S", ) # validate the JSON content follows the specification try: validate_report_content(content) except jsonschema.exceptions.ValidationError as err: - logger.error("Failed schema check - report variants may be corrupted or unmatched.") + logger.error( + "Failed schema check - report variants may be corrupted or unmatched." + ) logger.error(f"Failed schema check: {err}") - kb_disease_match = content['kbDiseaseMatch'] + kb_disease_match = content["kbDiseaseMatch"] # validate the input variants - small_mutations = preprocess_small_mutations(content.get('smallMutations', [])) - structural_variants = preprocess_structural_variants(content.get('structuralVariants', [])) - copy_variants = preprocess_copy_variants(content.get('copyVariants', [])) - expression_variants = preprocess_expression_variants(content.get('expressionVariants', [])) + small_mutations = preprocess_small_mutations(content.get("smallMutations", [])) + structural_variants = preprocess_structural_variants( + content.get("structuralVariants", []) + ) + copy_variants = preprocess_copy_variants(content.get("copyVariants", [])) + expression_variants = preprocess_expression_variants( + content.get("expressionVariants", []) + ) if expression_variants: check_comparators(content, expression_variants) @@ -274,7 +298,7 @@ def ipr_report( ipr_spec = ipr_conn.get_spec() if graphkb_url: - logger.info(f'connecting to graphkb: {graphkb_url}') + logger.info(f"connecting to graphkb: {graphkb_url}") graphkb_conn = GraphKBConnection(graphkb_url) else: graphkb_conn = GraphKBConnection() @@ -285,61 +309,65 @@ def ipr_report( # Signature category variants tmb_variant: IprVariant = {} tmb_matches = [] - if 'tmburMutationBurden' in content.keys(): + if "tmburMutationBurden" in content.keys(): tmb_val = 0.0 tmb = {} try: - tmb = content.get('tmburMutationBurden', {}) - tmb_val = tmb['genomeIndelTmb'] + tmb['genomeSnvTmb'] + tmb = content.get("tmburMutationBurden", {}) + tmb_val = tmb["genomeIndelTmb"] + tmb["genomeSnvTmb"] except Exception as err: logger.error(f"tmburMutationBurden parsing failure: {err}") if tmb_val >= TMB_HIGH: logger.warning( - f'GERO-296 - tmburMutationBurden high -checking graphkb matches for {TMB_HIGH_CATEGORY}' + f"GERO-296 - tmburMutationBurden high -checking graphkb matches for {TMB_HIGH_CATEGORY}" ) - if not tmb.get('key'): - tmb['key'] = TMB_HIGH_CATEGORY - if not tmb.get('kbCategory'): - tmb['kbCategory'] = TMB_HIGH_CATEGORY + if not tmb.get("key"): + tmb["key"] = TMB_HIGH_CATEGORY + if not tmb.get("kbCategory"): + tmb["kbCategory"] = TMB_HIGH_CATEGORY # GERO-296 - try matching to graphkb - tmb_matches = annotate_tmb(graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY) + tmb_matches = annotate_tmb( + graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY + ) if tmb_matches: - tmb_variant['kbCategory'] = TMB_HIGH_CATEGORY # type: ignore - tmb_variant['variant'] = TMB_HIGH_CATEGORY - tmb_variant['key'] = tmb['key'] - tmb_variant['variantType'] = 'tmb' + tmb_variant["kbCategory"] = TMB_HIGH_CATEGORY # type: ignore + tmb_variant["variant"] = TMB_HIGH_CATEGORY + tmb_variant["key"] = tmb["key"] + tmb_variant["variantType"] = "tmb" logger.info( f"GERO-296 '{TMB_HIGH_CATEGORY}' matches {len(tmb_matches)} statements." ) gkb_matches.extend(tmb_matches) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - msi = content.get('msi', []) + msi = content.get("msi", []) msi_matches = [] msi_variant: IprVariant = {} if msi: # only one msi variant per library if isinstance(msi, list): - msi_cat = msi[0].get('kbCategory') + msi_cat = msi[0].get("kbCategory") elif isinstance(msi, str): msi_cat = msi else: - msi_cat = msi.get('kbCategory') + msi_cat = msi.get("kbCategory") msi_variant = msi.copy() - logger.info(f'Matching GKB msi {msi_cat}') + logger.info(f"Matching GKB msi {msi_cat}") msi_matches = annotate_msi(graphkb_conn, kb_disease_match, msi_cat) if msi_matches: - msi_variant['kbCategory'] = msi_cat # type: ignore - msi_variant['variant'] = msi_cat - msi_variant['key'] = msi_cat - msi_variant['variantType'] = 'msi' - logger.info(f"GERO-295 '{msi_cat}' matches {len(msi_matches)} msi statements.") + msi_variant["kbCategory"] = msi_cat # type: ignore + msi_variant["variant"] = msi_cat + msi_variant["key"] = msi_cat + msi_variant["variantType"] = "msi" + logger.info( + f"GERO-295 '{msi_cat}' matches {len(msi_matches)} msi statements." + ) gkb_matches.extend(msi_matches) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(small_mutations)} small mutations') + logger.info(f"annotating {len(small_mutations)} small mutations") gkb_matches.extend( annotate_positional_variants( graphkb_conn, small_mutations, kb_disease_match, show_progress=interactive @@ -347,15 +375,18 @@ def ipr_report( ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(structural_variants)} structural variants') + logger.info(f"annotating {len(structural_variants)} structural variants") gkb_matches.extend( annotate_positional_variants( - graphkb_conn, structural_variants, kb_disease_match, show_progress=interactive + graphkb_conn, + structural_variants, + kb_disease_match, + show_progress=interactive, ) ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(copy_variants)} copy variants') + logger.info(f"annotating {len(copy_variants)} copy variants") gkb_matches.extend( annotate_copy_variants( graphkb_conn, copy_variants, kb_disease_match, show_progress=interactive @@ -363,10 +394,13 @@ def ipr_report( ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(expression_variants)} expression variants') + logger.info(f"annotating {len(expression_variants)} expression variants") gkb_matches.extend( annotate_expression_variants( - graphkb_conn, expression_variants, kb_disease_match, show_progress=interactive + graphkb_conn, + expression_variants, + kb_disease_match, + show_progress=interactive, ) ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") @@ -378,68 +412,79 @@ def ipr_report( if tmb_matches: all_variants.append(tmb_variant) # type: ignore - if match_germline: # verify germline kb statements matched germline observed variants + if ( + match_germline + ): # verify germline kb statements matched germline observed variants gkb_matches = germline_kb_matches(gkb_matches, all_variants) if gkb_matches: - logger.info(f"Removing {len(gkb_matches)} germline events without medical matches.") + logger.info( + f"Removing {len(gkb_matches)} germline events without medical matches." + ) if custom_kb_match_filter: - logger.info(f'custom_kb_match_filter on {len(gkb_matches)} variants') + logger.info(f"custom_kb_match_filter on {len(gkb_matches)} variants") gkb_matches = custom_kb_match_filter(gkb_matches) - logger.info(f'\t custom_kb_match_filter left {len(gkb_matches)} variants') + logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants") key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants) - logger.info('fetching gene annotations') + logger.info("fetching gene annotations") gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants)) if generate_therapeutics: - logger.info('generating therapeutic options') + logger.info("generating therapeutic options") targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants) else: targets = [] - logger.info('generating analyst comments') + logger.info("generating analyst comments") if generate_comments: comments = { - 'comments': summarize( - graphkb_conn, gkb_matches, disease_name=kb_disease_match, variants=all_variants + "comments": summarize( + graphkb_conn, + gkb_matches, + disease_name=kb_disease_match, + variants=all_variants, ) } else: - comments = {'comments': ''} + comments = {"comments": ""} # thread safe deep-copy the original content output = json.loads(json.dumps(content)) output.update( { - 'kbMatches': [trim_empty_values(a) for a in gkb_matches], - 'copyVariants': [ - trim_empty_values(c) for c in copy_variants if c['gene'] in genes_with_variants + "kbMatches": [trim_empty_values(a) for a in gkb_matches], + "copyVariants": [ + trim_empty_values(c) + for c in copy_variants + if c["gene"] in genes_with_variants ], - 'smallMutations': [trim_empty_values(s) for s in small_mutations], - 'expressionVariants': [ + "smallMutations": [trim_empty_values(s) for s in small_mutations], + "expressionVariants": [ trim_empty_values(e) for e in expression_variants - if e['gene'] in genes_with_variants + if e["gene"] in genes_with_variants ], - 'kbDiseaseMatch': kb_disease_match, - 'kbUrl': graphkb_conn.url, - 'kbVersion': timestamp(), - 'structuralVariants': [ + "kbDiseaseMatch": kb_disease_match, + "kbUrl": graphkb_conn.url, + "kbVersion": timestamp(), + "structuralVariants": [ trim_empty_values(s) for s in filter_structural_variants( structural_variants, gkb_matches, gene_information ) ], - 'genes': gene_information, - 'genomicAlterationsIdentified': key_alterations, - 'variantCounts': variant_counts, - 'analystComments': comments, - 'therapeuticTarget': targets, + "genes": gene_information, + "genomicAlterationsIdentified": key_alterations, + "variantCounts": variant_counts, + "analystComments": comments, + "therapeuticTarget": targets, } ) - output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants)) + output.setdefault("images", []).extend( + select_expression_plots(gkb_matches, all_variants) + ) output = clean_unsupported_content(output, ipr_spec) ipr_result = None @@ -447,7 +492,7 @@ def ipr_report( if ipr_upload: try: - logger.info(f'Uploading to IPR {ipr_conn.url}') + logger.info(f"Uploading to IPR {ipr_conn.url}") ipr_result = ipr_conn.upload_report(output, async_upload, mins_to_wait) logger.info(ipr_result) output.update(ipr_result) @@ -456,11 +501,11 @@ def ipr_report( logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True) if output_json_path: if always_write_output_json or not ipr_result: - logger.info(f'Writing IPR upload json to: {output_json_path}') - with open(output_json_path, 'w') as fh: + logger.info(f"Writing IPR upload json to: {output_json_path}") + with open(output_json_path, "w") as fh: fh.write(json.dumps(output)) - logger.info(f'made {graphkb_conn.request_count} requests to graphkb') - logger.info(f'average load {int(graphkb_conn.load or 0)} req/s') + logger.info(f"made {graphkb_conn.request_count} requests to graphkb") + logger.info(f"average load {int(graphkb_conn.load or 0)} req/s") if upload_error: raise upload_error return output diff --git a/pori_python/ipr/summary.py b/pori_python/ipr/summary.py index 491bfad..a91f57e 100644 --- a/pori_python/ipr/summary.py +++ b/pori_python/ipr/summary.py @@ -1,14 +1,14 @@ import base64 import json +from typing import Callable, Dict, List, Sequence, Set, Tuple +from urllib.parse import urlencode + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb.constants import RELEVANCE_BASE_TERMS from pori_python.graphkb.statement import categorize_relevance from pori_python.graphkb.types import Ontology, Record from pori_python.graphkb.util import convert_to_rid_list from pori_python.graphkb.vocab import get_term_tree -from typing import Callable, Dict, List, Sequence, Set, Tuple -from urllib.parse import urlencode - from pori_python.ipr.inputs import create_graphkb_sv_notation from .types import GkbStatement, IprVariant, KbMatch @@ -20,10 +20,10 @@ logger, ) -OTHER_DISEASES = 'other disease types' -ENTREZ_GENE_URL = 'https://www.ncbi.nlm.nih.gov/gene' +OTHER_DISEASES = "other disease types" +ENTREZ_GENE_URL = "https://www.ncbi.nlm.nih.gov/gene" # TODO: https://www.bcgsc.ca/jira/browse/DEVSU-1181 -GRAPHKB_GUI = 'https://graphkb.bcgsc.ca' +GRAPHKB_GUI = "https://graphkb.bcgsc.ca" def filter_by_record_class( @@ -37,32 +37,33 @@ def check(name: str) -> bool: else: return name in record_classes - return [rec for rec in record_list if check(rec['@class'])] + return [rec for rec in record_list if check(rec["@class"])] def natural_join(word_list: List[str]) -> str: if len(word_list) > 1: - return ', '.join(word_list[:-1]) + ', and ' + word_list[-1] - return ''.join(word_list) + return ", ".join(word_list[:-1]) + ", and " + word_list[-1] + return "".join(word_list) def natural_join_records( - records: Sequence[Record], covert_to_word: Callable[[Dict], str] = lambda x: x['displayName'] + records: Sequence[Record], + covert_to_word: Callable[[Dict], str] = lambda x: x["displayName"], ) -> str: word_list = sorted(list({covert_to_word(rec) for rec in records})) return natural_join(word_list) -def create_graphkb_link(record_ids: List[str], record_class: str = 'Statement') -> str: +def create_graphkb_link(record_ids: List[str], record_class: str = "Statement") -> str: """ Create a link for a set of statements to the GraphKB client """ record_ids = sorted(list(set(record_ids))) if len(record_ids) == 1: return f'{GRAPHKB_GUI}/view/{record_class}/{record_ids[0].replace("#", "")}' - complex_param = base64.b64encode(json.dumps({'target': record_ids}).encode("utf-8")) - search_params = {'complex': complex_param, '@class': record_class} - return f'{GRAPHKB_GUI}/data/table?{urlencode(search_params)}' + complex_param = base64.b64encode(json.dumps({"target": record_ids}).encode("utf-8")) + search_params = {"complex": complex_param, "@class": record_class} + return f"{GRAPHKB_GUI}/data/table?{urlencode(search_params)}" def substitute_sentence_template( @@ -77,60 +78,78 @@ def substitute_sentence_template( """Create the filled-in sentence template for a given template and list of substitutions which may be the result of the aggregation of 1 or more statements. """ - disease_conditions = filter_by_record_class(conditions, 'Disease') + disease_conditions = filter_by_record_class(conditions, "Disease") variant_conditions = filter_by_record_class( - conditions, 'CategoryVariant', 'CatalogueVariant', 'PositionalVariant' + conditions, "CategoryVariant", "CatalogueVariant", "PositionalVariant" ) other_conditions = filter_by_record_class( conditions, - 'CategoryVariant', - 'CatalogueVariant', - 'PositionalVariant', - 'Disease', + "CategoryVariant", + "CatalogueVariant", + "PositionalVariant", + "Disease", exclude=True, ) - result = template.replace(r'{relevance}', relevance['displayName']) + result = template.replace(r"{relevance}", relevance["displayName"]) def merge_diseases(diseases: List[Ontology]) -> str: if len(convert_to_rid_set(diseases) - disease_matches) >= 2 and all( - [d['@class'] == 'Disease' for d in diseases] + [d["@class"] == "Disease" for d in diseases] ): words = sorted( - list(set([s['displayName'] for s in diseases if s['@rid'] in disease_matches])) + list( + set( + [ + s["displayName"] + for s in diseases + if s["@rid"] in disease_matches + ] + ) + ) ) words.append(OTHER_DISEASES) return natural_join(words) else: return natural_join_records(diseases) - if r'{subject}' in template: + if r"{subject}" in template: # remove subject from the conditions replacements subjects_ids = convert_to_rid_set(subjects) - disease_conditions = [d for d in disease_conditions if d['@rid'] not in subjects_ids] - variant_conditions = [d for d in variant_conditions if d['@rid'] not in subjects_ids] - other_conditions = [d for d in other_conditions if d['@rid'] not in subjects_ids] + disease_conditions = [ + d for d in disease_conditions if d["@rid"] not in subjects_ids + ] + variant_conditions = [ + d for d in variant_conditions if d["@rid"] not in subjects_ids + ] + other_conditions = [ + d for d in other_conditions if d["@rid"] not in subjects_ids + ] - result = result.replace(r'{subject}', merge_diseases(subjects)) + result = result.replace(r"{subject}", merge_diseases(subjects)) - if r'{conditions:disease}' in template: - result = result.replace(r'{conditions:disease}', merge_diseases(disease_conditions)) + if r"{conditions:disease}" in template: + result = result.replace( + r"{conditions:disease}", merge_diseases(disease_conditions) + ) else: other_conditions.extend(disease_conditions) - if r'{conditions:variant}' in template: - result = result.replace(r'{conditions:variant}', natural_join_records(variant_conditions)) + if r"{conditions:variant}" in template: + result = result.replace( + r"{conditions:variant}", natural_join_records(variant_conditions) + ) else: other_conditions.extend(variant_conditions) - result = result.replace(r'{conditions}', natural_join_records(other_conditions)) + result = result.replace(r"{conditions}", natural_join_records(other_conditions)) - link_url = create_graphkb_link(statement_rids) if statement_rids else '' + link_url = create_graphkb_link(statement_rids) if statement_rids else "" - if r'{evidence}' in template: - evidence_str = ', '.join(sorted(list({e['displayName'] for e in evidence}))) + if r"{evidence}" in template: + evidence_str = ", ".join(sorted(list({e["displayName"] for e in evidence}))) if link_url: evidence_str = f'{evidence_str}' - result = result.replace(r'{evidence}', evidence_str) + result = result.replace(r"{evidence}", evidence_str) return result @@ -148,18 +167,20 @@ def aggregate_statements( def generate_key(statement: GkbStatement) -> Tuple: result = [ - cond['displayName'] - for cond in filter_by_record_class(statement['conditions'], 'Disease', exclude=True) - if cond['@rid'] != statement['subject']['@rid'] + cond["displayName"] + for cond in filter_by_record_class( + statement["conditions"], "Disease", exclude=True + ) + if cond["@rid"] != statement["subject"]["@rid"] ] - if statement.get('subject', {}).get('@class', 'Disease') != 'Disease': - subject = statement['subject'] - if subject['@class'] == 'Therapy': - alt = get_preferred_drug_representation(graphkb_conn, subject['@rid']) - statement['subject'] = alt - result.append(statement['subject']['displayName']) - result.append(statement['relevance']['displayName']) - result.append(statement['displayNameTemplate']) + if statement.get("subject", {}).get("@class", "Disease") != "Disease": + subject = statement["subject"] + if subject["@class"] == "Therapy": + alt = get_preferred_drug_representation(graphkb_conn, subject["@rid"]) + statement["subject"] = alt + result.append(statement["subject"]["displayName"]) + result.append(statement["relevance"]["displayName"]) + result.append(statement["displayNameTemplate"]) return tuple(sorted(set(result))) for statement in statements: @@ -171,12 +192,12 @@ def generate_key(statement: GkbStatement) -> Tuple: conditions = [] subjects = [] evidence = [] - relevance = group[0]['relevance'] - template = group[0]['displayNameTemplate'] + relevance = group[0]["relevance"] + template = group[0]["displayNameTemplate"] for statement in group: - conditions.extend(statement['conditions']) - evidence.extend(statement['evidence']) - subjects.append(statement['subject']) + conditions.extend(statement["conditions"]) + evidence.extend(statement["evidence"]) + subjects.append(statement["subject"]) sentence = substitute_sentence_template( template, @@ -189,17 +210,17 @@ def generate_key(statement: GkbStatement) -> Tuple: ) for statement in group: - result[statement['@rid']] = sentence + result[statement["@rid"]] = sentence return result def display_variant(variant: IprVariant) -> str: """Short, human readable variant description string.""" - gene = variant.get('gene', '') - if not gene and 'gene1' in variant and 'gene2' in variant: + gene = variant.get("gene", "") + if not gene and "gene1" in variant and "gene2" in variant: gene = f'({variant.get("gene1", "")},{variant.get("gene2", "")})' - if variant.get('kbCategory'): + if variant.get("kbCategory"): return f'{variant.get("kbCategory")} of {gene}' # Special display of IprFusionVariant with exons @@ -208,28 +229,32 @@ def display_variant(variant: IprVariant) -> str: # Use chosen legacy 'proteinChange' or an hgvs description of lowest detail. hgvs = variant.get( - 'proteinChange', - variant.get('hgvsProtein', variant.get('hgvsCds', variant.get('hgvsGenomic', ''))), + "proteinChange", + variant.get( + "hgvsProtein", variant.get("hgvsCds", variant.get("hgvsGenomic", "")) + ), ) if gene and hgvs: - return f'{gene}:{hgvs}' + return f"{gene}:{hgvs}" elif variant.get("variant"): return variant.get("variant") - raise ValueError(f'Unable to form display_variant of {variant}') + raise ValueError(f"Unable to form display_variant of {variant}") def display_variants(gene_name: str, variants: List[IprVariant]) -> str: - result = sorted(list({v for v in [display_variant(e) for e in variants] if gene_name in v})) + result = sorted( + list({v for v in [display_variant(e) for e in variants] if gene_name in v}) + ) variants_text = natural_join(result) if len(result) > 1: + return f"Multiple variants of the gene {gene_name} were observed in this case: {variants_text}" + elif result: return ( - f'Multiple variants of the gene {gene_name} were observed in this case: {variants_text}' + f"{variants_text[0].upper()}{variants_text[1:]} was observed in this case." ) - elif result: - return f'{variants_text[0].upper()}{variants_text[1:]} was observed in this case.' - return '' + return "" def create_section_html( @@ -242,14 +267,16 @@ def create_section_html( """ Generate HTML for a gene section of the comments """ - output = [f'

{gene_name}

'] + output = [f"

{gene_name}

"] sentence_categories: Dict[str, str] = {} for statement_id, sentence in sentences_by_statement_id.items(): - relevance = statements[statement_id]['relevance']['@rid'] + relevance = statements[statement_id]["relevance"]["@rid"] category = categorize_relevance( - graphkb_conn, relevance, RELEVANCE_BASE_TERMS + [('resistance', ['no sensitivity'])] + graphkb_conn, + relevance, + RELEVANCE_BASE_TERMS + [("resistance", ["no sensitivity"])], ) sentence_categories[sentence] = category @@ -257,12 +284,17 @@ def create_section_html( genes = sorted( graphkb_conn.query( { - 'target': 'Feature', - 'filters': { - 'AND': [ - {'source': {'target': 'Source', 'filters': {'name': 'entrez gene'}}}, - {'name': gene_name}, - {'biotype': 'gene'}, + "target": "Feature", + "filters": { + "AND": [ + { + "source": { + "target": "Source", + "filters": {"name": "entrez gene"}, + } + }, + {"name": gene_name}, + {"biotype": "gene"}, ] }, } @@ -273,39 +305,50 @@ def create_section_html( variants_text = display_variants(gene_name, exp_variants) if not variants_text: # exclude sections where they are not linked to an experimental variant. this can occur when there are co-occurent statements collected - return '' - if genes and genes[0].get('description', ''): - description = '. '.join(genes[0]['description'].split('. ')[:2]) # type: ignore - sourceId = genes[0].get('sourceId', '') + return "" + if genes and genes[0].get("description", ""): + description = ". ".join(genes[0]["description"].split(". ")[:2]) # type: ignore + sourceId = genes[0].get("sourceId", "") output.append( - f''' + f"""
{description}.

{variants_text}

-''' +""" ) sentences_used: Set[str] = set() for section in [ - {s for (s, v) in sentence_categories.items() if v == 'diagnostic'}, - {s for (s, v) in sentence_categories.items() if v == 'biological'}, - {s for (s, v) in sentence_categories.items() if v in ['therapeutic', 'prognostic']}, + {s for (s, v) in sentence_categories.items() if v == "diagnostic"}, + {s for (s, v) in sentence_categories.items() if v == "biological"}, { s for (s, v) in sentence_categories.items() - if v not in ['diagnostic', 'biological', 'therapeutic', 'prognostic', 'resistance'] + if v in ["therapeutic", "prognostic"] }, - {s for (s, v) in sentence_categories.items() if v == 'resistance'}, + { + s + for (s, v) in sentence_categories.items() + if v + not in [ + "diagnostic", + "biological", + "therapeutic", + "prognostic", + "resistance", + ] + }, + {s for (s, v) in sentence_categories.items() if v == "resistance"}, ]: - content = '. '.join(sorted(list(section - sentences_used))) + content = ". ".join(sorted(list(section - sentences_used))) sentences_used.update(section) - output.append(f'

{content}

') - return '\n'.join(output) + output.append(f"

{content}

") + return "\n".join(output) def section_statements_by_genes( @@ -315,16 +358,16 @@ def section_statements_by_genes( genes: Dict[str, Set[str]] = {} for statement in statements: - for condition in statement['conditions']: - if condition.get('biotype', '') == 'gene': - gene = get_preferred_gene_name(graphkb_conn, condition['@rid']) - genes.setdefault(gene, set()).add(statement['@rid']) + for condition in statement["conditions"]: + if condition.get("biotype", "") == "gene": + gene = get_preferred_gene_name(graphkb_conn, condition["@rid"]) + genes.setdefault(gene, set()).add(statement["@rid"]) else: - for cond_ref_key in ('reference1', 'reference2'): + for cond_ref_key in ("reference1", "reference2"): cond_ref_gene = condition.get(cond_ref_key) if cond_ref_gene: gene = get_preferred_gene_name(graphkb_conn, str(cond_ref_gene)) - genes.setdefault(gene, set()).add(statement['@rid']) + genes.setdefault(gene, set()).add(statement["@rid"]) return genes @@ -338,12 +381,12 @@ def summarize( """Given a list of GraphKB matches, generate a text summary to add to the report.""" templates: Dict[str, List[GkbStatement]] = {} statements: Dict[str, GkbStatement] = {} - variants_by_keys = {v['key']: v for v in variants} + variants_by_keys = {v["key"]: v for v in variants} variant_keys_by_statement_ids: Dict[str, Set[str]] = {} for match in matches: - rid = match['kbStatementId'] - exp_variant = match['variant'] + rid = match["kbStatementId"] + exp_variant = match["variant"] variant_keys_by_statement_ids.setdefault(rid, set()).add(exp_variant) exp_variants_by_statements: Dict[str, List[IprVariant]] = {} @@ -355,27 +398,31 @@ def summarize( exp_variants_by_statements[rid] = [] disease_matches = convert_to_rid_set( - get_term_tree(graphkb_conn, disease_name, ontology_class='Disease') + get_term_tree(graphkb_conn, disease_name, ontology_class="Disease") ) # get details for statements for match in matches: - rid = match['kbStatementId'].replace('#', '') - result = graphkb_conn.request(f'/statements/{rid}?neighbors=1')['result'] + rid = match["kbStatementId"].replace("#", "") + result = graphkb_conn.request(f"/statements/{rid}?neighbors=1")["result"] - templates.setdefault(result['displayNameTemplate'], []).append(result) - statements[result['@rid']] = result + templates.setdefault(result["displayNameTemplate"], []).append(result) + statements[result["@rid"]] = result # aggregate similar sentences sentences = {} for template, group in templates.items(): - sentences.update(aggregate_statements(graphkb_conn, template, group, disease_matches)) + sentences.update( + aggregate_statements(graphkb_conn, template, group, disease_matches) + ) # section statements by genes - statements_by_genes = section_statements_by_genes(graphkb_conn, list(statements.values())) + statements_by_genes = section_statements_by_genes( + graphkb_conn, list(statements.values()) + ) output: List[str] = [ - '

The comments below were automatically generated from matches to GraphKB and have not been manually reviewed

' + "

The comments below were automatically generated from matches to GraphKB and have not been manually reviewed

" ] for section, statement_rids in sorted( @@ -384,7 +431,7 @@ def summarize( exp_variants = {} for variant_list in [exp_variants_by_statements[r] for r in statement_rids]: for variant in variant_list: - exp_variants[variant['key']] = variant + exp_variants[variant["key"]] = variant output.append( create_section_html( @@ -396,4 +443,4 @@ def summarize( ) ) - return '\n'.join(output) + return "\n".join(output) diff --git a/pori_python/ipr/therapeutic_options.py b/pori_python/ipr/therapeutic_options.py index 4d52f9d..d1cc1f8 100644 --- a/pori_python/ipr/therapeutic_options.py +++ b/pori_python/ipr/therapeutic_options.py @@ -3,9 +3,10 @@ """ import pandas -from pori_python.graphkb import GraphKBConnection from typing import Dict, List, Sequence +from pori_python.graphkb import GraphKBConnection + from .types import IprVariant, KbMatch from .util import ( create_variant_name_tuple, @@ -16,63 +17,65 @@ def create_therapeutic_options( - graphkb_conn: GraphKBConnection, kb_matches: List[KbMatch], variants: Sequence[IprVariant] + graphkb_conn: GraphKBConnection, + kb_matches: List[KbMatch], + variants: Sequence[IprVariant], ) -> List[Dict]: """ Generate therapeutic options summary from the list of kb-matches """ options = [] - resistance_markers = get_terms_set(graphkb_conn, ['no sensitivity']) + resistance_markers = get_terms_set(graphkb_conn, ["no sensitivity"]) for match in kb_matches: - row_type = 'therapeutic' - if match['category'] != 'therapeutic' or match['relevance'] == 'eligibility': + row_type = "therapeutic" + if match["category"] != "therapeutic" or match["relevance"] == "eligibility": continue - if match['kbRelevanceId'] in resistance_markers: - row_type = 'chemoresistance' - variant = find_variant(variants, match['variantType'], match['variant']) - drug = get_preferred_drug_representation(graphkb_conn, match['kbContextId']) + if match["kbRelevanceId"] in resistance_markers: + row_type = "chemoresistance" + variant = find_variant(variants, match["variantType"], match["variant"]) + drug = get_preferred_drug_representation(graphkb_conn, match["kbContextId"]) gene, variant_string = create_variant_name_tuple(variant) options.append( { - 'gene': gene, - 'type': row_type, - 'therapy': drug['displayName'], - 'therapyGraphkbId': drug['@rid'], - 'context': match['relevance'], - 'contextGraphkbId': match['kbRelevanceId'], - 'variantGraphkbId': match['kbVariantId'], - 'variant': variant_string, - 'evidenceLevel': match['evidenceLevel'], - 'kbStatementIds': match['kbStatementId'], - 'notes': '', + "gene": gene, + "type": row_type, + "therapy": drug["displayName"], + "therapyGraphkbId": drug["@rid"], + "context": match["relevance"], + "contextGraphkbId": match["kbRelevanceId"], + "variantGraphkbId": match["kbVariantId"], + "variant": variant_string, + "evidenceLevel": match["evidenceLevel"], + "kbStatementIds": match["kbStatementId"], + "notes": "", } ) if not options: return options options_df = pandas.DataFrame.from_records(options) - def delimited_list(inputs: List, delimiter: str = ' / ') -> str: + def delimited_list(inputs: List, delimiter: str = " / ") -> str: return delimiter.join(sorted(list({i for i in inputs if i}))) - options_df = options_df.groupby(['gene', 'type', 'therapy', 'variant']).agg( + options_df = options_df.groupby(["gene", "type", "therapy", "variant"]).agg( { - 'evidenceLevel': delimited_list, - 'context': delimited_list, - 'notes': lambda x: delimited_list(x, ' '), + "evidenceLevel": delimited_list, + "context": delimited_list, + "notes": lambda x: delimited_list(x, " "), } ) options_df = options_df.reset_index() - options = options_df.to_dict('records') + options = options_df.to_dict("records") therapeutic_rank = 0 chemoresistance_rank = 0 for option in options: - if option['type'] == 'therapeutic': - option['rank'] = therapeutic_rank + if option["type"] == "therapeutic": + option["rank"] = therapeutic_rank therapeutic_rank += 1 else: - option['rank'] = chemoresistance_rank + option["rank"] = chemoresistance_rank chemoresistance_rank += 1 return options diff --git a/pori_python/ipr/types.py b/pori_python/ipr/types.py index 42154dd..415fa9f 100644 --- a/pori_python/ipr/types.py +++ b/pori_python/ipr/types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, Dict +from typing import Dict, List, Optional, Union try: from typing import TypedDict # type: ignore diff --git a/pori_python/ipr/util.py b/pori_python/ipr/util.py index b0647f3..0b57de5 100644 --- a/pori_python/ipr/util.py +++ b/pori_python/ipr/util.py @@ -2,23 +2,24 @@ import json import logging import pandas as pd +from numpy import nan +from typing import Any, Dict, List, Sequence, Set, Tuple, cast + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb.types import Ontology, Record from pori_python.graphkb.vocab import get_term_tree -from numpy import nan -from typing import Any, Dict, List, Sequence, Set, Tuple, cast from .types import IprVariant GENE_NEIGHBORS_MAX = 3 # name the logger after the package to make it simple to disable for packages using this one as a dependency -logger = logging.getLogger('ipr') +logger = logging.getLogger("ipr") LOG_LEVELS = { - 'info': logging.INFO, - 'debug': logging.DEBUG, - 'warn': logging.WARN, - 'error': logging.ERROR, + "info": logging.INFO, + "debug": logging.DEBUG, + "warn": logging.WARN, + "error": logging.ERROR, } @@ -31,23 +32,25 @@ def get_terms_set(graphkb_conn: GraphKBConnection, base_terms: List[str]) -> Set terms = set() for base_term in base_terms: terms.update( - convert_to_rid_set(get_term_tree(graphkb_conn, base_term, include_superclasses=False)) + convert_to_rid_set( + get_term_tree(graphkb_conn, base_term, include_superclasses=False) + ) ) return terms def hash_key(key: Tuple[str]) -> str: - body = json.dumps({'key': key}, sort_keys=True) - hash_code = hashlib.md5(body.encode('utf-8')).hexdigest() + body = json.dumps({"key": key}, sort_keys=True) + hash_code = hashlib.md5(body.encode("utf-8")).hexdigest() return hash_code def convert_to_rid_set(records: Sequence[Record]) -> Set[str]: - return {r['@rid'] for r in records} + return {r["@rid"] for r in records} -def trim_empty_values(obj: IprVariant, empty_values: Sequence = ('', None, nan)): - blacklist = ('gene1', 'gene2') # allow null for sv genes +def trim_empty_values(obj: IprVariant, empty_values: Sequence = ("", None, nan)): + blacklist = ("gene1", "gene2") # allow null for sv genes keys = list(obj.keys()) for key in keys: @@ -61,19 +64,21 @@ def create_variant_name_tuple(variant: IprVariant) -> Tuple[str, str]: Given an IPR variant row, create the variant representation to be used as the name of the variant """ - variant_type = variant['variantType'] - gene = str(variant.get('gene', variant.get('gene1', ''))) - if variant_type == 'exp': - return (gene, str(variant.get('expressionState', ''))) - elif variant_type == 'cnv': - return (gene, str(variant.get('cnvState', ''))) + variant_type = variant["variantType"] + gene = str(variant.get("gene", variant.get("gene1", ""))) + if variant_type == "exp": + return (gene, str(variant.get("expressionState", ""))) + elif variant_type == "cnv": + return (gene, str(variant.get("cnvState", ""))) variant_split = ( - variant['variant'].split(':', 1)[1] if ':' in variant['variant'] else variant['variant'] + variant["variant"].split(":", 1)[1] + if ":" in variant["variant"] + else variant["variant"] ) - gene2 = str(variant.get('gene2', '')) + gene2 = str(variant.get("gene2", "")) if gene and gene2: - gene = f'{gene}, {gene2}' + gene = f"{gene}, {gene2}" elif gene2: gene = gene2 @@ -87,28 +92,30 @@ def find_variant( Find a variant in a list of variants by its key and type """ for variant in all_variants: - if variant['key'] == variant_key and variant['variantType'] == variant_type: + if variant["key"] == variant_key and variant["variantType"] == variant_type: return variant - raise KeyError(f'expected variant ({variant_key}, {variant_type}) does not exist') + raise KeyError(f"expected variant ({variant_key}, {variant_type}) does not exist") -def generate_ontology_preference_key(record: Ontology, sources_sort: Dict[str, int] = {}) -> Tuple: +def generate_ontology_preference_key( + record: Ontology, sources_sort: Dict[str, int] = {} +) -> Tuple: """Generate a tuple key for comparing preferred ontology terms.""" return ( - record.get('name') == record.get('sourceId'), - record.get('deprecated', False), - record.get('alias', False), - bool(record.get('dependency', '')), - sources_sort.get(record['source'], 99999), - record['sourceId'], - record.get('sourceIdVersion', ''), - record['name'], + record.get("name") == record.get("sourceId"), + record.get("deprecated", False), + record.get("alias", False), + bool(record.get("dependency", "")), + sources_sort.get(record["source"], 99999), + record["sourceId"], + record.get("sourceIdVersion", ""), + record["name"], ) def get_alternatives(graphkb_conn: GraphKBConnection, record_id: str) -> List[Ontology]: rec_list = graphkb_conn.query( - {'target': [record_id], 'queryType': 'similarTo', 'treeEdges': []} + {"target": [record_id], "queryType": "similarTo", "treeEdges": []} ) return [cast(Ontology, rec) for rec in rec_list] @@ -121,8 +128,10 @@ def get_preferred_drug_representation( """ source_preference = { - r['@rid']: r['sort'] - for r in graphkb_conn.query({'target': 'Source', 'returnProperties': ['sort', '@rid']}) + r["@rid"]: r["sort"] + for r in graphkb_conn.query( + {"target": "Source", "returnProperties": ["sort", "@rid"]} + ) } drugs = sorted( get_alternatives(graphkb_conn, drug_record_id), @@ -136,42 +145,46 @@ def get_preferred_gene_name( ) -> str: """Given some Feature record ID return the preferred gene name.""" record = graphkb_conn.get_record_by_id(record_id) - biotype = record.get('biotype', '') + biotype = record.get("biotype", "") genes = [] - expanded_gene_names = graphkb_conn.query({'target': [record_id], 'neighbors': neighbors}) - assert len(expanded_gene_names) == 1, "get_preferred_gene_name should have single result" + expanded_gene_names = graphkb_conn.query( + {"target": [record_id], "neighbors": neighbors} + ) + assert ( + len(expanded_gene_names) == 1 + ), "get_preferred_gene_name should have single result" expanded: Dict[str, List] = expanded_gene_names[0] # type: ignore - if biotype != 'gene': - for edge in expanded.get('out_ElementOf', []): - target = edge['in'] - if target.get('biotype') == 'gene': + if biotype != "gene": + for edge in expanded.get("out_ElementOf", []): + target = edge["in"] + if target.get("biotype") == "gene": genes.append(target) for edge_type in [ - 'out_AliasOf', - 'in_AliasOf', - 'in_DeprecatedBy', - 'out_CrossReferenceOf', - 'in_CrossReferenceOf', + "out_AliasOf", + "in_AliasOf", + "in_DeprecatedBy", + "out_CrossReferenceOf", + "in_CrossReferenceOf", ]: - target_name = 'out' if edge_type.startswith('in') else 'in' + target_name = "out" if edge_type.startswith("in") else "in" for edge in expanded.get(edge_type, []): target = edge[target_name] - if target.get('biotype') == 'gene': + if target.get("biotype") == "gene": genes.append(target) genes = sorted( genes, key=lambda gene: ( - gene['deprecated'], - bool(gene['dependency']), - '_' in gene['name'], - gene['name'].startswith('ens'), + gene["deprecated"], + bool(gene["dependency"]), + "_" in gene["name"], + gene["name"].startswith("ens"), ), ) if genes: - return genes[0]['displayName'] + return genes[0]["displayName"] # fallback to the input displayName - return str(record.get('displayName', '')) + return str(record.get("displayName", "")) def pandas_falsy(field: Any) -> bool: diff --git a/tests/test_graphkb/data.py b/tests/test_graphkb/data.py index 764f905..c0f69d8 100644 --- a/tests/test_graphkb/data.py +++ b/tests/test_graphkb/data.py @@ -10,203 +10,84 @@ # Unambiguous structural variations "(FGFR3,BRCA2):fusion(g.1234567,g.1234567)": { "matches": { - "displayName": [ - "FGFR3 fusion", - "FGFR3 rearrangement", - ], - "type": [ - "fusion", - "rearrangement", - ], - }, + "displayName": ["FGFR3 fusion", "FGFR3 rearrangement"], + "type": ["fusion", "rearrangement"], + } }, # ambiguous structural variations -> structural "FGFR3:c.1200_1300dup": { "matches": { - "displayName": [ - "FGFR3 mutation", - "FGFR3 rearrangement", - ], - "type": [ - "mutation", - "rearrangement", - ], - }, + "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], + "type": ["mutation", "rearrangement"], + } }, "FGFR3:c.1200_1201insACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT": { "matches": { - "displayName": [ - "FGFR3 mutation", - "FGFR3 rearrangement", - ], - "type": [ - "mutation", - "rearrangement", - ], - }, + "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], + "type": ["mutation", "rearrangement"], + } }, "FGFR3:g.5000_5100del": { "matches": { - "displayName": [ - "FGFR3 mutation", - "FGFR3 rearrangement", - ], - "type": [ - "mutation", - "rearrangement", - ], - }, + "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], + "type": ["mutation", "rearrangement"], + } }, "FGFR3:c.1200_1300delinsA": { "matches": { - "displayName": [ - "FGFR3 mutation", - "FGFR3 rearrangement", - ], - "type": [ - "mutation", - "rearrangement", - ], - }, + "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], + "type": ["mutation", "rearrangement"], + } }, "FGFR3:c.1200delinsACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT": { "matches": { - "displayName": [ - "FGFR3 mutation", - "FGFR3 rearrangement", - ], - "type": [ - "mutation", - "rearrangement", - ], - }, + "displayName": ["FGFR3 mutation", "FGFR3 rearrangement"], + "type": ["mutation", "rearrangement"], + } }, # ambiguous structural variations -> non-structural "FGFR3:c.1200dup": { - "matches": { - "displayName": [ - "FGFR3 mutation", - ], - "type": [ - "mutation", - ], - }, + "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, "does_not_matches": { - "displayName": [ - "FGFR3 rearrangement", - ], - "type": [ - "rearrangement", - ], + "displayName": ["FGFR3 rearrangement"], + "type": ["rearrangement"], }, }, "FGFR3:c.1200_1201insA": { - "matches": { - "displayName": [ - "FGFR3 mutation", - ], - "type": [ - "mutation", - ], - }, + "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, "does_not_matches": { - "displayName": [ - "FGFR3 rearrangement", - ], - "type": [ - "rearrangement", - ], + "displayName": ["FGFR3 rearrangement"], + "type": ["rearrangement"], }, }, "FGFR3:g.5000del": { - "matches": { - "displayName": [ - "FGFR3 mutation", - ], - "type": [ - "mutation", - ], - }, + "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, "does_not_matches": { - "displayName": [ - "FGFR3 rearrangement", - ], - "type": [ - "rearrangement", - ], + "displayName": ["FGFR3 rearrangement"], + "type": ["rearrangement"], }, }, "FGFR3:c.1200delinsA": { - "matches": { - "displayName": [ - "FGFR3 mutation", - ], - "type": [ - "mutation", - ], - }, + "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, "does_not_matches": { - "displayName": [ - "FGFR3 rearrangement", - ], - "type": [ - "rearrangement", - ], + "displayName": ["FGFR3 rearrangement"], + "type": ["rearrangement"], }, }, "STK11:e.1_100del": { - "matches": { - "displayName": [ - "STK11 mutation", - ], - "type": [ - "mutation", - ], - }, - "does_not_matches": { - "displayName": [ - "STK11 deletion", - ], - "type": [ - "deletion", - ], - }, + "matches": {"displayName": ["STK11 mutation"], "type": ["mutation"]}, + "does_not_matches": {"displayName": ["STK11 deletion"], "type": ["deletion"]}, }, "STK11:i.1_100del": { - "matches": { - "displayName": [ - "STK11 mutation", - ], - "type": [ - "mutation", - ], - }, - "does_not_matches": { - "displayName": [ - "STK11 deletion", - ], - "type": [ - "deletion", - ], - }, + "matches": {"displayName": ["STK11 mutation"], "type": ["mutation"]}, + "does_not_matches": {"displayName": ["STK11 deletion"], "type": ["deletion"]}, }, # non-structural variations "FGFR3:c.1200C>A": { - "matches": { - "displayName": [ - "FGFR3 mutation", - ], - "type": [ - "mutation", - ], - }, + "matches": {"displayName": ["FGFR3 mutation"], "type": ["mutation"]}, "does_not_matches": { - "displayName": [ - "FGFR3 rearrangement", - ], - "type": [ - "rearrangement", - ], + "displayName": ["FGFR3 rearrangement"], + "type": ["rearrangement"], }, }, } diff --git a/tests/test_graphkb/test_genes.py b/tests/test_graphkb/test_genes.py index d746cec..efd5506 100644 --- a/tests/test_graphkb/test_genes.py +++ b/tests/test_graphkb/test_genes.py @@ -3,20 +3,19 @@ """ import os - import pytest from pori_python.graphkb import GraphKBConnection from pori_python.graphkb.genes import ( get_cancer_genes, get_cancer_predisposition_info, - get_gene_linked_cancer_predisposition_info, get_gene_information, + get_gene_linked_cancer_predisposition_info, + get_gene_linked_pharmacogenomic_info, get_genes_from_variant_types, get_oncokb_oncogenes, get_oncokb_tumour_supressors, get_pharmacogenomic_info, - get_gene_linked_pharmacogenomic_info, get_preferred_gene_name, get_therapeutic_associated_genes, ) @@ -152,7 +151,7 @@ def test_get_pharmacogenomic_info(conn): break else: # no break called # failing on this version of the func; addressed in 'new' version - if gene == 'ACYP2': + if gene == "ACYP2": continue assert False, f"No rid found for a pharmacogenomic with {gene}" @@ -169,14 +168,18 @@ def test_get_gene_linked_pharmacogenomic_info(conn): assert False, f"No rid found for a pharmacogenomic with {gene}" -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) def test_get_cancer_predisposition_info(conn): genes, matches = get_cancer_predisposition_info(conn) for gene in CANCER_PREDISP_INITIAL_GENES: assert gene in genes, f"{gene} not found in get_cancer_predisposition_info" -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) def test_get_gene_linked_cancer_predisposition_info(conn): genes, matches = get_gene_linked_cancer_predisposition_info(conn) for gene in CANCER_PREDISP_INITIAL_GENES: @@ -193,7 +196,9 @@ def test_get_preferred_gene_name_kras(alt_rep, conn): ), f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'" -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) def test_find_genes_by_variant_type_structural_variant(conn): result = get_genes_from_variant_types(conn, ["structural variant"]) names = {row["name"] for row in result} @@ -201,7 +206,9 @@ def test_find_genes_by_variant_type_structural_variant(conn): assert gene in names, f"{gene} was not identified as a structural variant gene." -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) def test_find_no_genes_by_variant_type_with_nonmatching_source_record_id(conn): refseq_id = get_rid(conn, target="source", name="refseq") result = get_genes_from_variant_types( @@ -210,7 +217,9 @@ def test_find_no_genes_by_variant_type_with_nonmatching_source_record_id(conn): assert not result -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) def test_get_therapeutic_associated_genes(conn): gene_list = get_therapeutic_associated_genes(graphkb_conn=conn) assert gene_list, "No get_therapeutic_associated_genes found" @@ -222,7 +231,9 @@ def test_get_therapeutic_associated_genes(conn): assert gene in names, f"{gene} not found by get_therapeutic_associated_genes" -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) def test_get_gene_information(conn): gene_info = get_gene_information( conn, diff --git a/tests/test_graphkb/test_graphkb.py b/tests/test_graphkb/test_graphkb.py index 8f9bd2e..5a7e48f 100644 --- a/tests/test_graphkb/test_graphkb.py +++ b/tests/test_graphkb/test_graphkb.py @@ -1,7 +1,6 @@ import os -from unittest import mock - import pytest +from unittest import mock from pori_python.graphkb import GraphKBConnection diff --git a/tests/test_graphkb/test_match.py b/tests/test_graphkb/test_match.py index 6e9165f..3df10e3 100644 --- a/tests/test_graphkb/test_match.py +++ b/tests/test_graphkb/test_match.py @@ -1,10 +1,9 @@ import os +import pytest import re from typing import List from unittest.mock import MagicMock -import pytest - import pori_python.graphkb from pori_python.graphkb import GraphKBConnection, match from pori_python.graphkb.constants import ( @@ -77,7 +76,11 @@ def test_checks_by_source_id_kras(self, conn): kras = [ f["displayName"] for f in match.get_equivalent_features( - conn, "nm_033360", source="refseq", source_id_version="4", is_source_id=True + conn, + "nm_033360", + source="refseq", + source_id_version="4", + is_source_id=True, ) ] assert "KRAS" in kras @@ -90,10 +93,14 @@ def test_bad_category(self, conn): def test_bad_gene_name(self, conn): with pytest.raises(FeatureNotFoundError): - match.match_copy_variant(conn, "not a real gene name", match.INPUT_COPY_CATEGORIES.AMP) + match.match_copy_variant( + conn, "not a real gene name", match.INPUT_COPY_CATEGORIES.AMP + ) def test_known_loss(self, conn): - matches = match.match_copy_variant(conn, "CDKN2A", match.INPUT_COPY_CATEGORIES.ANY_LOSS) + matches = match.match_copy_variant( + conn, "CDKN2A", match.INPUT_COPY_CATEGORIES.ANY_LOSS + ) assert matches types_selected = {record["type"]["name"] for record in matches} @@ -143,7 +150,9 @@ def test_known_gain(self, conn): EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" ) def test_low_gain_excludes_amplification(self, conn): - matches = match.match_copy_variant(conn, "KRAS", match.INPUT_COPY_CATEGORIES.GAIN) + matches = match.match_copy_variant( + conn, "KRAS", match.INPUT_COPY_CATEGORIES.GAIN + ) types_selected = {record["type"]["name"] for record in matches} @@ -155,9 +164,13 @@ def test_low_gain_excludes_amplification(self, conn): assert not has_prefix(variant_type, DECREASE_PREFIXES) -@pytest.mark.parametrize("pos1,pos2_start,pos2_end", [[3, 2, 5], [2, None, 5], [3, 2, None]]) +@pytest.mark.parametrize( + "pos1,pos2_start,pos2_end", [[3, 2, 5], [2, None, 5], [3, 2, None]] +) def test_range_overlap(pos1, pos2_start, pos2_end): - assert match.positions_overlap({"pos": pos1}, {"pos": pos2_start}, {"pos": pos2_end}) + assert match.positions_overlap( + {"pos": pos1}, {"pos": pos2_start}, {"pos": pos2_end} + ) @pytest.mark.parametrize( @@ -165,7 +178,9 @@ def test_range_overlap(pos1, pos2_start, pos2_end): [[2, 4, 5], [5, 2, 3], [10, None, 9], [10, 11, None], [1, 2, 2], [2, 1, 1]], ) def test_range_not_overlap(pos1, pos2_start, pos2_end): - assert not match.positions_overlap({"pos": pos1}, {"pos": pos2_start}, {"pos": pos2_end}) + assert not match.positions_overlap( + {"pos": pos1}, {"pos": pos2_start}, {"pos": pos2_end} + ) @pytest.mark.parametrize("pos1", [None, 1]) @@ -203,7 +218,9 @@ def test_known_reduced_expression(self, conn): assert not has_prefix(variant_type, INCREASE_PREFIXES) def test_known_reduced_expression_gene_id(self, conn): - gene_id = conn.query({"target": "Feature", "filters": [{"name": "PTEN"}]})[0]["@rid"] + gene_id = conn.query({"target": "Feature", "filters": [{"name": "PTEN"}]})[0][ + "@rid" + ] matches = match.match_expression_variant( conn, gene_id, match.INPUT_EXPRESSION_CATEGORIES.DOWN ) @@ -221,7 +238,9 @@ def test_known_reduced_expression_gene_id(self, conn): EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" ) def test_known_increased_expression(self, conn): - matches = match.match_expression_variant(conn, "CA9", match.INPUT_EXPRESSION_CATEGORIES.UP) + matches = match.match_expression_variant( + conn, "CA9", match.INPUT_EXPRESSION_CATEGORIES.UP + ) assert matches types_selected = {record["type"]["name"] for record in matches} @@ -240,10 +259,12 @@ def test_nonspecific_altseq(self): ) # null matches anything assert match.compare_positional_variants( - {"break1Start": {"pos": 1}, "untemplatedSeq": "T"}, {"break1Start": {"pos": 1}} + {"break1Start": {"pos": 1}, "untemplatedSeq": "T"}, + {"break1Start": {"pos": 1}}, ) assert match.compare_positional_variants( - {"break1Start": {"pos": 1}}, {"break1Start": {"pos": 1}, "untemplatedSeq": "T"} + {"break1Start": {"pos": 1}}, + {"break1Start": {"pos": 1}, "untemplatedSeq": "T"}, ) @pytest.mark.parametrize("seq1", ["T", "X", "?"]) @@ -279,15 +300,18 @@ def test_nonspecific_refseq(self): def test_ambiguous_refseq(self, seq1, seq2): # ambiguous AA matches anything the same length assert match.compare_positional_variants( - {"break1Start": {"pos": 1}, "refSeq": seq1}, {"break1Start": {"pos": 1}, "refSeq": seq2} + {"break1Start": {"pos": 1}, "refSeq": seq1}, + {"break1Start": {"pos": 1}, "refSeq": seq2}, ) def test_refseq_length_mismatch(self): assert not match.compare_positional_variants( - {"break1Start": {"pos": 1}, "refSeq": "??"}, {"break1Start": {"pos": 1}, "refSeq": "T"} + {"break1Start": {"pos": 1}, "refSeq": "??"}, + {"break1Start": {"pos": 1}, "refSeq": "T"}, ) assert not match.compare_positional_variants( - {"break1Start": {"pos": 1}, "refSeq": "?"}, {"break1Start": {"pos": 1}, "refSeq": "TT"} + {"break1Start": {"pos": 1}, "refSeq": "?"}, + {"break1Start": {"pos": 1}, "refSeq": "TT"}, ) def test_diff_altseq(self): @@ -304,12 +328,14 @@ def test_same_altseq_matches(self): def test_diff_refseq(self): assert not match.compare_positional_variants( - {"break1Start": {"pos": 1}, "refSeq": "M"}, {"break1Start": {"pos": 1}, "refSeq": "R"} + {"break1Start": {"pos": 1}, "refSeq": "M"}, + {"break1Start": {"pos": 1}, "refSeq": "R"}, ) def test_same_refseq_matches(self): assert match.compare_positional_variants( - {"break1Start": {"pos": 1}, "refSeq": "R"}, {"break1Start": {"pos": 1}, "refSeq": "R"} + {"break1Start": {"pos": 1}, "refSeq": "R"}, + {"break1Start": {"pos": 1}, "refSeq": "R"}, ) def test_range_vs_sub(self): @@ -363,7 +389,9 @@ def test_bad_gene2_name(self, conn): match.match_positional_variant(conn, "(BCR,ME-AS-A-GENE):fusion(e.13,e.3)") def test_match_explicit_reference1(self, conn): - reference1 = conn.query({"target": "Feature", "filters": {"name": "KRAS"}})[0]["@rid"] + reference1 = conn.query({"target": "Feature", "filters": {"name": "KRAS"}})[0][ + "@rid" + ] matches = match.match_positional_variant(conn, "p.G12D", reference1=reference1) assert matches @@ -371,8 +399,12 @@ def test_match_explicit_reference1(self, conn): EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" ) def test_match_explicit_references(self, conn): - reference1 = conn.query({"target": "Feature", "filters": {"name": "BCR"}})[0]["@rid"] - reference2 = conn.query({"target": "Feature", "filters": {"name": "ABL1"}})[0]["@rid"] + reference1 = conn.query({"target": "Feature", "filters": {"name": "BCR"}})[0][ + "@rid" + ] + reference2 = conn.query({"target": "Feature", "filters": {"name": "ABL1"}})[0][ + "@rid" + ] matches = match.match_positional_variant( conn, "fusion(e.13,e.3)", reference1=reference1, reference2=reference2 ) @@ -390,7 +422,9 @@ def test_match_explicit_references(self, conn): ["EGFR:p.E746_S752delinsI", ["EGFR mutation"], ["EGFR copy variant"]], ], ) - def test_known_variants(self, conn, known_variant, related_variants, unrelated_variants): + def test_known_variants( + self, conn, known_variant, related_variants, unrelated_variants + ): matches = match.match_positional_variant(conn, known_variant) names = {m["displayName"] for m in matches} assert matches @@ -404,7 +438,10 @@ def test_known_variants(self, conn, known_variant, related_variants, unrelated_v "known_variant,related_variants", [ ["(BCR,ABL1):fusion(e.13,e.3)", ["BCR and ABL1 fusion"]], - ["(ATP1B1,NRG1):fusion(e.2,e.2)", ["NRG1 fusion", "ATP1B1 and NRG1 fusion"]], + [ + "(ATP1B1,NRG1):fusion(e.2,e.2)", + ["NRG1 fusion", "ATP1B1 and NRG1 fusion"], + ], ], ) def test_known_fusions(self, conn, known_variant, related_variants): @@ -445,7 +482,8 @@ def test_tert_promoter(self, conn): assert match.match_positional_variant(conn, "TERT:c.-124C>T") @pytest.mark.skipif( - True, reason="GERO-303 - technically incorrect notation for GSC backwards compatibility." + True, + reason="GERO-303 - technically incorrect notation for GSC backwards compatibility.", ) def test_tert_promoter_leading_one_alt_notation(self, conn): # GERO-303 - technically this format is incorrect. @@ -474,18 +512,18 @@ def test_structural_variants(self, conn): MatchingTypes = [el["type"]["name"] for el in m] # Match - for displayName in expected.get('matches', {}).get("displayName", []): + for displayName in expected.get("matches", {}).get("displayName", []): assert displayName in MatchingDisplayNames - for type in expected.get('matches', {}).get("type", []): + for type in expected.get("matches", {}).get("type", []): assert type in MatchingTypes # Does not match for displayName in MatchingDisplayNames: - assert displayName not in expected.get('does_not_matches', {}).get( + assert displayName not in expected.get("does_not_matches", {}).get( "displayName", [] ) for type in MatchingTypes: - assert type not in expected.get('does_not_matches', {}).get("type", []) + assert type not in expected.get("does_not_matches", {}).get("type", []) class TestCacheMissingFeatures: @@ -508,16 +546,8 @@ class TestTypeScreening: # Types as class variables default_type = DEFAULT_NON_STRUCTURAL_VARIANT_TYPE threshold = STRUCTURAL_VARIANT_SIZE_THRESHOLD - unambiguous_structural = [ - "fusion", - "translocation", - ] - ambiguous_structural = [ - "duplication", - "deletion", - "insertion", - "indel", - ] + unambiguous_structural = ["fusion", "translocation"] + ambiguous_structural = ["duplication", "deletion", "insertion", "indel"] non_structural = [ "substitution", "missense", @@ -533,11 +563,15 @@ def mock_get_terms_set(graphkb_conn, base_terms): called = True return set() - monkeypatch.setattr("pori_python.graphkb.match.get_terms_set", mock_get_terms_set) + monkeypatch.setattr( + "pori_python.graphkb.match.get_terms_set", mock_get_terms_set + ) # Assert get_terms_set() has been called called = False - pori_python.graphkb.match.type_screening(conn, {"type": ""}, updateStructuralTypes=True) + pori_python.graphkb.match.type_screening( + conn, {"type": ""}, updateStructuralTypes=True + ) assert called # Assert get_terms_set() has not been called (default behavior) @@ -556,14 +590,17 @@ def test_type_screening_structural(self, conn): assert match.type_screening(conn, {"type": type}) == type for type in TestTypeScreening.ambiguous_structural: # w/ reference2 - assert match.type_screening(conn, {"type": type, "reference2": "#123:45"}) == type + assert ( + match.type_screening(conn, {"type": type, "reference2": "#123:45"}) + == type + ) # w/ cytoband coordinates assert match.type_screening(conn, {"type": type, "prefix": "y"}) == type def test_type_screening_structural_ambiguous_size(self, conn): for type in TestTypeScreening.ambiguous_structural: # coordinate system with ambiguous size - for prefix in ['e', 'i']: + for prefix in ["e", "i"]: assert ( match.type_screening( conn, @@ -593,10 +630,7 @@ def test_type_screening_structural_untemplatedSeqSize(self, conn): assert ( match.type_screening( conn, - { - "type": type, - "untemplatedSeqSize": TestTypeScreening.threshold, - }, + {"type": type, "untemplatedSeqSize": TestTypeScreening.threshold}, ) == type ) @@ -606,11 +640,26 @@ def test_type_screening_structural_positions(self, conn): # Variation length too small (< threshold) for opt in [ {"break2Start": {"pos": TestTypeScreening.threshold - 1}}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "c"}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "g"}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "n"}, - {"break2Start": {"pos": TestTypeScreening.threshold - 1}, "prefix": "r"}, - {"break2Start": {"pos": int(TestTypeScreening.threshold / 3) - 1}, "prefix": "p"}, + { + "break2Start": {"pos": TestTypeScreening.threshold - 1}, + "prefix": "c", + }, + { + "break2Start": {"pos": TestTypeScreening.threshold - 1}, + "prefix": "g", + }, + { + "break2Start": {"pos": TestTypeScreening.threshold - 1}, + "prefix": "n", + }, + { + "break2Start": {"pos": TestTypeScreening.threshold - 1}, + "prefix": "r", + }, + { + "break2Start": {"pos": int(TestTypeScreening.threshold / 3) - 1}, + "prefix": "p", + }, { "break1Start": {"pos": 1 + 99}, "break2Start": {"pos": TestTypeScreening.threshold + 99 - 1}, @@ -627,7 +676,10 @@ def test_type_screening_structural_positions(self, conn): {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "g"}, {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "n"}, {"break2Start": {"pos": TestTypeScreening.threshold}, "prefix": "r"}, - {"break2Start": {"pos": int(TestTypeScreening.threshold / 3) + 1}, "prefix": "p"}, + { + "break2Start": {"pos": int(TestTypeScreening.threshold / 3) + 1}, + "prefix": "p", + }, { "break1Start": {"pos": 1 + 99}, "break2Start": {"pos": TestTypeScreening.threshold + 99}, diff --git a/tests/test_graphkb/test_statement.py b/tests/test_graphkb/test_statement.py index 8935f9f..ff0b9b6 100644 --- a/tests/test_graphkb/test_statement.py +++ b/tests/test_graphkb/test_statement.py @@ -1,7 +1,6 @@ import os -from unittest.mock import Mock - import pytest +from unittest.mock import Mock from pori_python.graphkb import statement @@ -86,9 +85,17 @@ def test_custom_categories(self, graphkb_conn): assert category == "blargh" -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) class TestStatementMatch: - def test_truncating_categories(self, conn): # noqa - pytest fixture, not redefinition - variant = {"@class": "CategoryVariant", "@rid": "#161:429", "displayName": "RB1 truncating"} + def test_truncating_categories( + self, conn + ): # noqa - pytest fixture, not redefinition + variant = { + "@class": "CategoryVariant", + "@rid": "#161:429", + "displayName": "RB1 truncating", + } statements = statement.get_statements_from_variants(conn, [variant]) assert statements diff --git a/tests/test_graphkb/test_util.py b/tests/test_graphkb/test_util.py index fc90b9e..a61bc92 100644 --- a/tests/test_graphkb/test_util.py +++ b/tests/test_graphkb/test_util.py @@ -1,5 +1,4 @@ import os - import pytest from pori_python.graphkb import GraphKBConnection, util @@ -98,12 +97,28 @@ class TestStripDisplayName: @pytest.mark.parametrize( "opt,stripDisplayName", [ - [{"displayName": "ABL1:p.T315I", "withRef": True, "withRefSeq": True}, "ABL1:p.T315I"], - [{"displayName": "ABL1:p.T315I", "withRef": False, "withRefSeq": True}, "p.T315I"], - [{"displayName": "ABL1:p.T315I", "withRef": True, "withRefSeq": False}, "ABL1:p.315I"], - [{"displayName": "ABL1:p.T315I", "withRef": False, "withRefSeq": False}, "p.315I"], [ - {"displayName": "chr3:g.41266125C>T", "withRef": False, "withRefSeq": False}, + {"displayName": "ABL1:p.T315I", "withRef": True, "withRefSeq": True}, + "ABL1:p.T315I", + ], + [ + {"displayName": "ABL1:p.T315I", "withRef": False, "withRefSeq": True}, + "p.T315I", + ], + [ + {"displayName": "ABL1:p.T315I", "withRef": True, "withRefSeq": False}, + "ABL1:p.315I", + ], + [ + {"displayName": "ABL1:p.T315I", "withRef": False, "withRefSeq": False}, + "p.315I", + ], + [ + { + "displayName": "chr3:g.41266125C>T", + "withRef": False, + "withRefSeq": False, + }, "g.41266125>T", ], [ @@ -143,8 +158,16 @@ class TestStringifyVariant: {"withRef": False, "withRefSeq": False}, "fusion(e.10,e.12)", ], - ["ABCA12:p.N1671Ifs*4", {"withRef": False, "withRefSeq": False}, "p.1671Ifs*4"], - ["x:y.p22.33copyloss", {"withRef": False, "withRefSeq": False}, "y.p22.33copyloss"], + [ + "ABCA12:p.N1671Ifs*4", + {"withRef": False, "withRefSeq": False}, + "p.1671Ifs*4", + ], + [ + "x:y.p22.33copyloss", + {"withRef": False, "withRefSeq": False}, + "y.p22.33copyloss", + ], # TODO: ['MED12:p.(?34_?68)mut', {'withRef': False, 'withRefSeq': False}, 'p.(34_68)mut'], # TODO: ['FLT3:p.(?572_?630)_(?572_?630)ins', {'withRef': False, 'withRefSeq': False}, 'p.(572_630)_(572_630)ins'], ], @@ -163,7 +186,9 @@ def test_stringifyVariant_parsed(self, conn, hgvs_string, opt, stringifiedVarian ["#158:35317", 1652734056311, "c.1>G"], ], ) - def test_stringifyVariant_positional(self, conn, rid, createdAt, stringifiedVariant): + def test_stringifyVariant_positional( + self, conn, rid, createdAt, stringifiedVariant + ): opt = {"withRef": False, "withRefSeq": False} variant = conn.get_record_by_id(rid) if variant and variant.get("createdAt", None) == createdAt: diff --git a/tests/test_graphkb/test_vocab.py b/tests/test_graphkb/test_vocab.py index 2861da8..fc8497f 100644 --- a/tests/test_graphkb/test_vocab.py +++ b/tests/test_graphkb/test_vocab.py @@ -3,7 +3,6 @@ """ import os - import pytest from pori_python.graphkb import GraphKBConnection, genes, vocab diff --git a/tests/test_ipr/constants.py b/tests/test_ipr/constants.py index 8b211ff..b4edeab 100644 --- a/tests/test_ipr/constants.py +++ b/tests/test_ipr/constants.py @@ -1,3 +1,3 @@ import os -EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1' +EXCLUDE_INTEGRATION_TESTS = os.environ.get("EXCLUDE_INTEGRATION_TESTS") == "1" diff --git a/tests/test_ipr/test_annotate.py b/tests/test_ipr/test_annotate.py index dd4cc01..00a63a2 100644 --- a/tests/test_ipr/test_annotate.py +++ b/tests/test_ipr/test_annotate.py @@ -1,7 +1,7 @@ import os import pytest -from pori_python.graphkb import GraphKBConnection +from pori_python.graphkb import GraphKBConnection from pori_python.ipr.annotate import annotate_positional_variants from pori_python.ipr.types import IprSmallMutationVariant @@ -9,40 +9,40 @@ # Mutations are actually identical but on alternate transcripts. TP53_MUT_DICT = { - 'pref': IprSmallMutationVariant( # type: ignore + "pref": IprSmallMutationVariant( # type: ignore { - 'key': 'SDEV-3122_preferred', - 'gene': 'TP53', - 'hgvsGenomic': 'chr17:g.7674252C>T', - 'hgvsCds': 'ENST00000269305:c.711G>A', - 'hgvsProtein': 'TP53:p.M237I', + "key": "SDEV-3122_preferred", + "gene": "TP53", + "hgvsGenomic": "chr17:g.7674252C>T", + "hgvsCds": "ENST00000269305:c.711G>A", + "hgvsProtein": "TP53:p.M237I", } ), - 'intersect': IprSmallMutationVariant( # type: ignore + "intersect": IprSmallMutationVariant( # type: ignore { - 'key': 'SDEV-3122_alt', - 'gene': 'TP53', - 'hgvsGenomic': 'chr17:g.7674252C>T', - 'hgvsCds': 'ENST00000610292:c.594G>A', - 'hgvsProtein': 'TP53:p.M198I', + "key": "SDEV-3122_alt", + "gene": "TP53", + "hgvsGenomic": "chr17:g.7674252C>T", + "hgvsCds": "ENST00000610292:c.594G>A", + "hgvsProtein": "TP53:p.M198I", } ), - 'prot_only': IprSmallMutationVariant( # type: ignore - {'key': 'prot_only', 'gene': 'TP53', 'hgvsProtein': 'TP53:p.M237I'} + "prot_only": IprSmallMutationVariant( # type: ignore + {"key": "prot_only", "gene": "TP53", "hgvsProtein": "TP53:p.M237I"} ), - 'cds_only': IprSmallMutationVariant( # type: ignore - {'key': 'cds_only', 'gene': 'TP53', 'hgvsCds': 'ENST00000269305:c.711G>A'} + "cds_only": IprSmallMutationVariant( # type: ignore + {"key": "cds_only", "gene": "TP53", "hgvsCds": "ENST00000269305:c.711G>A"} ), - 'genome_only': IprSmallMutationVariant( # type: ignore - {'key': 'genome_only', 'gene': 'TP53', 'hgvsGenomic': 'chr17:g.7674252C>T'} + "genome_only": IprSmallMutationVariant( # type: ignore + {"key": "genome_only", "gene": "TP53", "hgvsGenomic": "chr17:g.7674252C>T"} ), } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def graphkb_conn(): - username = os.environ['IPR_USER'] - password = os.environ['IPR_PASS'] + username = os.environ["IPR_USER"] + password = os.environ["IPR_PASS"] graphkb_conn = GraphKBConnection() graphkb_conn.login(username, password) return graphkb_conn @@ -51,47 +51,53 @@ def graphkb_conn(): class TestAnnotation: def test_annotate_nonsense_vs_missense(self, graphkb_conn): """Verify missense (point mutation) is not mistaken for a nonsense (stop codon) mutation.""" - disease = 'cancer' - for key in ('prot_only', 'cds_only', 'genome_only', 'pref'): - matched = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[key]], disease) + disease = "cancer" + for key in ("prot_only", "cds_only", "genome_only", "pref"): + matched = annotate_positional_variants( + graphkb_conn, [TP53_MUT_DICT[key]], disease + ) # nonsense - stop codon - should not match. This is missense not nonsense (#164:933). - nonsense = [a for a in matched if a['kbVariant'] == 'TP53 nonsense'] + nonsense = [a for a in matched if a["kbVariant"] == "TP53 nonsense"] assert not nonsense, f"nonsense matched to {key}: {TP53_MUT_DICT[key]}" assert matched, f"should have matched in {key}: {TP53_MUT_DICT[key]}" def test_annotate_nonsense_vs_missense_protein(self, graphkb_conn): """Verify missense (point mutation) is not mistaken for a nonsense (stop codon) mutation.""" - disease = 'cancer' - for key in ('prot_only', 'pref'): - matched = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[key]], disease) + disease = "cancer" + for key in ("prot_only", "pref"): + matched = annotate_positional_variants( + graphkb_conn, [TP53_MUT_DICT[key]], disease + ) # nonsense - stop codon - should not match. This is missense not nonsense (#164:933). - nonsense = [a for a in matched if 'nonsense' in a['kbVariant']] + nonsense = [a for a in matched if "nonsense" in a["kbVariant"]] assert not nonsense, f"nonsense matched to {key}: {TP53_MUT_DICT[key]}" assert matched, f"should have matched in {key}: {TP53_MUT_DICT[key]}" def test_annotate_structural_variants_tp53(self, graphkb_conn): """Verify alternate TP53 variants match.""" - disease = 'cancer' - ref_key = 'prot_only' - pref = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[ref_key]], disease) + disease = "cancer" + ref_key = "prot_only" + pref = annotate_positional_variants( + graphkb_conn, [TP53_MUT_DICT[ref_key]], disease + ) # GERO-299 - nonsense - stop codon - should not match. This is missense not nonsense (#164:933). - nonsense = [a for a in pref if a['kbVariant'] == 'TP53 nonsense'] + nonsense = [a for a in pref if a["kbVariant"] == "TP53 nonsense"] assert not nonsense - pref_vars = set([m['kbVariant'] for m in pref]) + pref_vars = set([m["kbVariant"] for m in pref]) assert pref_vars, f"No matches to {TP53_MUT_DICT[pref]}" print(pref_vars) for key, alt_rep in TP53_MUT_DICT.items(): if key == ref_key: continue alt = annotate_positional_variants(graphkb_conn, [alt_rep], disease) - alt_vars = set([m['kbVariant'] for m in alt]) + alt_vars = set([m["kbVariant"] for m in alt]) diff = pref_vars.symmetric_difference(alt_vars) missing = pref_vars.difference(alt_vars) known_issues = set() - if key == 'genome_only': + if key == "genome_only": # genome_only matched to more precise type 'TP53 deleterious mutation' but not 'TP53 mutation' - known_issues.add('TP53 mutation') + known_issues.add("TP53 mutation") missing = pref_vars.difference(alt_vars).difference(known_issues) print(alt_vars) diff --git a/tests/test_ipr/test_connection.py b/tests/test_ipr/test_connection.py index d83ac79..a825a97 100644 --- a/tests/test_ipr/test_connection.py +++ b/tests/test_ipr/test_connection.py @@ -4,37 +4,39 @@ from pori_python.ipr.connection import IprConnection -IMAGE_DIR = os.path.join(os.path.dirname(__file__), '../../docs/images') +IMAGE_DIR = os.path.join(os.path.dirname(__file__), "../../docs/images") class TestPostImages: def test_no_images_ok(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None + json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None ) return m - with mock.patch('pori_python.ipr.connection.requests.request', request): - conn = IprConnection('user', 'pass') - result = conn.post_images('report_id', files={}, data={}) + with mock.patch("pori_python.ipr.connection.requests.request", request): + conn = IprConnection("user", "pass") + result = conn.post_images("report_id", files={}, data={}) assert result is None def test_images_load_ok(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None + json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None ) return m - with mock.patch('pori_python.ipr.connection.requests.request', request): - conn = IprConnection('user', 'pass') + with mock.patch("pori_python.ipr.connection.requests.request", request): + conn = IprConnection("user", "pass") result = conn.post_images( - 'report_id', + "report_id", files={ - 'expression.correlation': os.path.join(IMAGE_DIR, 'expression_correlation.png'), - 'mixcr.circos_trb_vj_gene_usage': os.path.join( - IMAGE_DIR, 'mixcr.circos_trb_vj_gene_usage.png' + "expression.correlation": os.path.join( + IMAGE_DIR, "expression_correlation.png" + ), + "mixcr.circos_trb_vj_gene_usage": os.path.join( + IMAGE_DIR, "mixcr.circos_trb_vj_gene_usage.png" ), }, data={}, @@ -44,54 +46,57 @@ def request(*args, **kwargs): def test_images_with_data_load_ok(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None + json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None ) return m - with mock.patch('pori_python.ipr.connection.requests.request', request): - conn = IprConnection('user', 'pass') + with mock.patch("pori_python.ipr.connection.requests.request", request): + conn = IprConnection("user", "pass") result = conn.post_images( - 'report_id', + "report_id", files={ - 'expression.correlation': os.path.join(IMAGE_DIR, 'expression_correlation.png'), - 'mixcr.circos_trb_vj_gene_usage': os.path.join( - IMAGE_DIR, 'mixcr.circos_trb_vj_gene_usage.png' + "expression.correlation": os.path.join( + IMAGE_DIR, "expression_correlation.png" + ), + "mixcr.circos_trb_vj_gene_usage": os.path.join( + IMAGE_DIR, "mixcr.circos_trb_vj_gene_usage.png" ), }, - data={'expression.correlation.title': 'this is a title'}, + data={"expression.correlation.title": "this is a title"}, ) assert result is None def test_bad_file(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{'upload': 'successful'}], raise_for_status=lambda: None + json=lambda: [{"upload": "successful"}], raise_for_status=lambda: None ) return m - with mock.patch('pori_python.ipr.connection.requests.request', request): - conn = IprConnection('user', 'pass') + with mock.patch("pori_python.ipr.connection.requests.request", request): + conn = IprConnection("user", "pass") with pytest.raises(FileNotFoundError): conn.post_images( - 'report_id', files={'expression.correlation': 'thing/that/does/not/exist.png'} + "report_id", + files={"expression.correlation": "thing/that/does/not/exist.png"}, ) def test_failed_image_load(self): def request(*args, **kwargs): m = mock.MagicMock( - json=lambda: [{'upload': 'anything else', 'key': 'thing'}], + json=lambda: [{"upload": "anything else", "key": "thing"}], raise_for_status=lambda: None, ) return m - with mock.patch('pori_python.ipr.connection.requests.request', request): - conn = IprConnection('user', 'pass') + with mock.patch("pori_python.ipr.connection.requests.request", request): + conn = IprConnection("user", "pass") with pytest.raises(ValueError): conn.post_images( - 'report_id', + "report_id", { - 'expression.correlation': os.path.join( - IMAGE_DIR, 'expression_correlation.png' + "expression.correlation": os.path.join( + IMAGE_DIR, "expression_correlation.png" ) }, ) diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py index c27da8c..07c0723 100644 --- a/tests/test_ipr/test_inputs.py +++ b/tests/test_ipr/test_inputs.py @@ -3,9 +3,9 @@ import os import pandas as pd import pytest -from pori_python.graphkb.match import INPUT_COPY_CATEGORIES from unittest import mock +from pori_python.graphkb.match import INPUT_COPY_CATEGORIES from pori_python.ipr.inputs import ( COPY_OPTIONAL, check_comparators, @@ -20,8 +20,8 @@ from pori_python.ipr.types import IprFusionVariant, IprGeneVariant from pori_python.ipr.util import logger -DATA_DIR = os.path.join(os.path.dirname(__file__), 'test_data') -NON_EMPTY_STRING_NULLS = ['', None, np.nan, pd.NA] +DATA_DIR = os.path.join(os.path.dirname(__file__), "test_data") +NON_EMPTY_STRING_NULLS = ["", None, np.nan, pd.NA] def read_data_file(filename): @@ -31,186 +31,192 @@ def read_data_file(filename): class TestPreProcessSmallMutations: def test_load_test_file(self) -> None: records = preprocess_small_mutations( - pd.read_csv(os.path.join(DATA_DIR, 'small_mutations.tab'), sep='\t').to_dict('records') + pd.read_csv( + os.path.join(DATA_DIR, "small_mutations.tab"), sep="\t" + ).to_dict("records") ) assert records assert len(records) == 2614 def test_maintains_optional_fields(self): original = { - 'gene': 'A1BG', - 'proteinChange': 'p.V460M', - 'zygosity': 'het', - 'tumourAltCount': 42, - 'tumourRefCount': 48, - 'hgvsProtein': '', - 'transcript': 'ENST1000', - 'hgvsCds': '', - 'hgvsGenomic': '', - 'key': '02fe85a3477784b5ac0f8ecffb300d10', - 'variant': 'blargh', - 'chromosome': '2', - 'startPosition': 1234, + "gene": "A1BG", + "proteinChange": "p.V460M", + "zygosity": "het", + "tumourAltCount": 42, + "tumourRefCount": 48, + "hgvsProtein": "", + "transcript": "ENST1000", + "hgvsCds": "", + "hgvsGenomic": "", + "key": "02fe85a3477784b5ac0f8ecffb300d10", + "variant": "blargh", + "chromosome": "2", + "startPosition": 1234, } records = preprocess_small_mutations([original]) record = records[0] - assert record['variantType'] == 'mut' + assert record["variantType"] == "mut" for col in original: assert col in record - assert record['variant'] == 'A1BG:p.V460M' - assert 'endPosition' in record - assert record['endPosition'] == record['startPosition'] - assert 'tumourDepth' in record - assert record['tumourDepth'] == 90 + assert record["variant"] == "A1BG:p.V460M" + assert "endPosition" in record + assert record["endPosition"] == record["startPosition"] + assert "tumourDepth" in record + assert record["tumourDepth"] == 90 def test_null(self): original = { - 'gene': 'A1BG', - 'proteinChange': 'p.V460M', - 'tumourAltCount': 42, - 'tumourRefCount': 48, - 'startPosition': 1234, + "gene": "A1BG", + "proteinChange": "p.V460M", + "tumourAltCount": 42, + "tumourRefCount": 48, + "startPosition": 1234, } # Make sure TEST_KEYS are appropriate. # For some fields, like 'ref' and 'alt', NA is _not_ equivalent to a null string. - TEST_KEYS = ['startPosition', 'endPosition', 'tumourAltCount', 'tumourRefCount'] + TEST_KEYS = ["startPosition", "endPosition", "tumourAltCount", "tumourRefCount"] for key in TEST_KEYS: for null in NON_EMPTY_STRING_NULLS: small_mut = original.copy() small_mut[key] = null records = preprocess_small_mutations([small_mut]) record = records[0] - assert record['variantType'] == 'mut' + assert record["variantType"] == "mut" for col in original: assert col in record - assert record['variant'] == 'A1BG:p.V460M' - assert 'endPosition' in record + assert record["variant"] == "A1BG:p.V460M" + assert "endPosition" in record def test_load_small_mutations_probe(self) -> None: records = preprocess_small_mutations( - pd.read_csv(os.path.join(DATA_DIR, 'small_mutations_probe.tab'), sep='\t').to_dict( - 'records' - ) + pd.read_csv( + os.path.join(DATA_DIR, "small_mutations_probe.tab"), sep="\t" + ).to_dict("records") ) assert records assert len(records) == 4 - assert records[0]['variantType'] == 'mut' - assert 'variant' in records[0] + assert records[0]["variantType"] == "mut" + assert "variant" in records[0] class TestPreProcessCopyVariants: def test_load_copy_variants(self) -> None: records = preprocess_copy_variants( - pd.read_csv(os.path.join(DATA_DIR, 'copy_variants.tab'), sep='\t').to_dict('records') + pd.read_csv(os.path.join(DATA_DIR, "copy_variants.tab"), sep="\t").to_dict( + "records" + ) ) assert records assert len(records) == 4603 - assert records[0]['variantType'] == 'cnv' - assert 'variant' in records[0] + assert records[0]["variantType"] == "cnv" + assert "variant" in records[0] def test_null(self): for kb_cat in list(INPUT_COPY_CATEGORIES.values()) + NON_EMPTY_STRING_NULLS: - original = {'gene': 'ERBB2', 'kbCategory': kb_cat} + original = {"gene": "ERBB2", "kbCategory": kb_cat} for key in COPY_OPTIONAL: for null in NON_EMPTY_STRING_NULLS: copy_var = original.copy() copy_var[key] = null records = preprocess_copy_variants([copy_var]) record = records[0] - assert record['variantType'] == 'cnv' + assert record["variantType"] == "cnv" def test_load_structural_variants() -> None: records = preprocess_structural_variants( - pd.read_csv(os.path.join(DATA_DIR, 'fusions.tab'), sep='\t').to_dict('records') + pd.read_csv(os.path.join(DATA_DIR, "fusions.tab"), sep="\t").to_dict("records") ) assert records assert len(records) == 5 - assert records[0]['variantType'] == 'sv' - assert 'variant' in records[0] + assert records[0]["variantType"] == "sv" + assert "variant" in records[0] def test_load_expression_variants() -> None: records = preprocess_expression_variants( - pd.read_csv(os.path.join(DATA_DIR, 'expression.tab'), sep='\t').to_dict('records') + pd.read_csv(os.path.join(DATA_DIR, "expression.tab"), sep="\t").to_dict( + "records" + ) ) assert records assert len(records) == 4603 - assert records[0]['variantType'] == 'exp' - assert 'variant' in records[0] + assert records[0]["variantType"] == "exp" + assert "variant" in records[0] class TestCheckVariantLinks: def test_sm_missing_copy_empty_ok(self) -> None: genes = check_variant_links( - small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore + small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore copy_variants=[], - expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore structural_variants=[], ) - assert genes == {'KRAS'} + assert genes == {"KRAS"} def test_sm_missing_exp_empty_ok(self) -> None: genes = check_variant_links( - small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore - copy_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore + copy_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore expression_variants=[], structural_variants=[], ) - assert genes == {'KRAS'} + assert genes == {"KRAS"} def test_sm_missing_copy(self) -> None: - with mock.patch.object(logger, 'debug') as mock_debug: + with mock.patch.object(logger, "debug") as mock_debug: check_variant_links( - small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore - copy_variants=[IprGeneVariant({'gene': 'CDK', 'variant': ''})], # type: ignore - expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore + copy_variants=[IprGeneVariant({"gene": "CDK", "variant": ""})], # type: ignore + expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore structural_variants=[], ) assert mock_debug.called def test_sm_missing_exp(self) -> None: - with mock.patch.object(logger, 'debug') as mock_debug: + with mock.patch.object(logger, "debug") as mock_debug: check_variant_links( - small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore - copy_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore - expression_variants=[IprGeneVariant({'gene': 'CDK', 'variant': ''})], # type: ignore + small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore + copy_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore + expression_variants=[IprGeneVariant({"gene": "CDK", "variant": ""})], # type: ignore structural_variants=[], ) assert mock_debug.called def test_with_valid_inputs(self) -> None: genes = check_variant_links( - small_mutations=[IprGeneVariant({'gene': 'KRAS'})], # type: ignore + small_mutations=[IprGeneVariant({"gene": "KRAS"})], # type: ignore copy_variants=[ - IprGeneVariant({'gene': 'KRAS', 'variant': ''}), # type: ignore - IprGeneVariant({'gene': 'CDK', 'variant': ''}), # type: ignore + IprGeneVariant({"gene": "KRAS", "variant": ""}), # type: ignore + IprGeneVariant({"gene": "CDK", "variant": ""}), # type: ignore ], - expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore structural_variants=[], ) - assert genes == {'KRAS'} + assert genes == {"KRAS"} def test_copy_missing_exp(self) -> None: - with mock.patch.object(logger, 'debug') as mock_debug: + with mock.patch.object(logger, "debug") as mock_debug: check_variant_links( small_mutations=[], copy_variants=[ - IprGeneVariant({'gene': 'BRAF', 'variant': 'copy gain'}), # type: ignore - IprGeneVariant({'gene': 'KRAS', 'variant': ''}), # type: ignore + IprGeneVariant({"gene": "BRAF", "variant": "copy gain"}), # type: ignore + IprGeneVariant({"gene": "KRAS", "variant": ""}), # type: ignore ], - expression_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + expression_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore structural_variants=[], ) assert mock_debug.called def test_exp_missing_copy(self) -> None: - with mock.patch.object(logger, 'debug') as mock_debug: + with mock.patch.object(logger, "debug") as mock_debug: check_variant_links( small_mutations=[], - copy_variants=[IprGeneVariant({'gene': 'KRAS', 'variant': ''})], # type: ignore + copy_variants=[IprGeneVariant({"gene": "KRAS", "variant": ""})], # type: ignore expression_variants=[ - IprGeneVariant({'gene': 'BRAF', 'variant': 'increased expression'}) # type: ignore + IprGeneVariant({"gene": "BRAF", "variant": "increased expression"}) # type: ignore ], structural_variants=[], ) @@ -220,67 +226,67 @@ def test_exp_missing_copy(self) -> None: class TestCreateGraphkbSvNotation: def test_both_genes_and_exons(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({'gene1': 'A', 'gene2': 'B', 'exon1': 1, 'exon2': 2}) # type: ignore + IprFusionVariant({"gene1": "A", "gene2": "B", "exon1": 1, "exon2": 2}) # type: ignore ) - assert notation == '(A,B):fusion(e.1,e.2)' + assert notation == "(A,B):fusion(e.1,e.2)" def test_one_exon_missing(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({'gene1': 'A', 'gene2': 'B', 'exon1': '', 'exon2': 2}) # type: ignore + IprFusionVariant({"gene1": "A", "gene2": "B", "exon1": "", "exon2": 2}) # type: ignore ) - assert notation == '(A,B):fusion(e.?,e.2)' + assert notation == "(A,B):fusion(e.?,e.2)" def test_one_gene_missing(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({'gene1': 'A', 'gene2': '', 'exon1': 1, 'exon2': 2}) # type: ignore + IprFusionVariant({"gene1": "A", "gene2": "", "exon1": 1, "exon2": 2}) # type: ignore ) - assert notation == '(A,?):fusion(e.1,e.2)' + assert notation == "(A,?):fusion(e.1,e.2)" def test_first_gene_missing(self) -> None: notation = create_graphkb_sv_notation( - IprFusionVariant({'gene1': '', 'gene2': 'B', 'exon1': 1, 'exon2': 2}) # type: ignore + IprFusionVariant({"gene1": "", "gene2": "B", "exon1": 1, "exon2": 2}) # type: ignore ) - assert notation == '(B,?):fusion(e.2,e.1)' + assert notation == "(B,?):fusion(e.2,e.1)" def test_no_genes_error(self) -> None: with pytest.raises(ValueError): create_graphkb_sv_notation( - IprFusionVariant({'gene1': '', 'gene2': '', 'exon1': 1, 'exon2': 2, 'key': 'x'}) # type: ignore + IprFusionVariant({"gene1": "", "gene2": "", "exon1": 1, "exon2": 2, "key": "x"}) # type: ignore ) class TestCheckComparators: def test_missing_disease_expression_error(self): - content = {'comparators': [{'analysisRole': 'expression (primary site)'}]} + content = {"comparators": [{"analysisRole": "expression (primary site)"}]} variants = [{}] with pytest.raises(ValueError): check_comparators(content, variants) def test_missing_primary_expression_error(self): - content = {'comparators': [{'analysisRole': 'expression (disease)'}]} - variants = [{'primarySiteFoldChange': 1}] + content = {"comparators": [{"analysisRole": "expression (disease)"}]} + variants = [{"primarySiteFoldChange": 1}] with pytest.raises(ValueError): check_comparators(content, variants) def test_missing_biopsy_expression_error(self): - content = {'comparators': [{'analysisRole': 'expression (disease)'}]} - variants = [{'biopsySitePercentile': 1}] + content = {"comparators": [{"analysisRole": "expression (disease)"}]} + variants = [{"biopsySitePercentile": 1}] with pytest.raises(ValueError): check_comparators(content, variants) def test_expression_not_required_without_variants(self): - content = {'comparators': []} + content = {"comparators": []} variants = [] assert check_comparators(content, variants) is None def test_missing_mutation_burden(self): content = { - 'comparators': [{'analysisRole': 'mutation burden (secondary)'}], - 'images': [{'key': 'mutationBurden.density_snv.primary'}], + "comparators": [{"analysisRole": "mutation burden (secondary)"}], + "images": [{"key": "mutationBurden.density_snv.primary"}], } variants = [] @@ -288,8 +294,10 @@ def test_missing_mutation_burden(self): check_comparators(content, variants) -@pytest.mark.parametrize("example_name", ['no_variants', 'sm_and_exp', 'sm_only']) +@pytest.mark.parametrize("example_name", ["no_variants", "sm_and_exp", "sm_only"]) def test_valid_json_inputs(example_name: str): - with open(os.path.join(DATA_DIR, 'json_examples', f'{example_name}.json'), 'r') as fh: + with open( + os.path.join(DATA_DIR, "json_examples", f"{example_name}.json"), "r" + ) as fh: content = json.load(fh) validate_report_content(content) diff --git a/tests/test_ipr/test_ipr.py b/tests/test_ipr/test_ipr.py index 38812f4..bda4d1d 100644 --- a/tests/test_ipr/test_ipr.py +++ b/tests/test_ipr/test_ipr.py @@ -1,8 +1,8 @@ import pytest -from pori_python.graphkb import statement as gkb_statement -from pori_python.graphkb import vocab as gkb_vocab from unittest.mock import Mock, patch +from pori_python.graphkb import statement as gkb_statement +from pori_python.graphkb import vocab as gkb_vocab from pori_python.ipr.ipr import convert_statements_to_alterations, germline_kb_matches from pori_python.ipr.types import GkbStatement @@ -154,7 +154,11 @@ class QueryMock: def __call__(self, *args, **kwargs): self.index += 1 - ret_val = self.return_values[self.index] if self.index < len(self.return_values) else [] + ret_val = ( + self.return_values[self.index] + if self.index < len(self.return_values) + else [] + ) return ret_val def mock_get_source(source): @@ -171,7 +175,11 @@ def base_graphkb_statement( statement = GkbStatement( # type: ignore { "conditions": [ - {"@class": "Disease", "@rid": disease_id, "displayName": "disease_display_name"}, + { + "@class": "Disease", + "@rid": disease_id, + "displayName": "disease_display_name", + }, { "@class": "CategoryVariant", "@rid": "variant_rid", @@ -180,9 +188,9 @@ def base_graphkb_statement( ], "evidence": [], "subject": { - "@class": 'dummy_value', + "@class": "dummy_value", "@rid": "101:010", - "displayName": 'dummy_display_name', + "displayName": "dummy_display_name", }, "source": None, "sourceId": None, @@ -294,7 +302,9 @@ def test_diagnostic(self, graphkb_conn) -> None: assert row["category"] == "diagnostic" @patch("pori_python.ipr.ipr.get_evidencelevel_mapping") - def test_unapproved_therapeutic(self, mock_get_evidencelevel_mapping, graphkb_conn) -> None: + def test_unapproved_therapeutic( + self, mock_get_evidencelevel_mapping, graphkb_conn + ) -> None: mock_get_evidencelevel_mapping.return_value = {"other": "test"} statement = base_graphkb_statement() @@ -309,8 +319,12 @@ def test_unapproved_therapeutic(self, mock_get_evidencelevel_mapping, graphkb_co assert row["category"] == "therapeutic" @patch("pori_python.ipr.ipr.get_evidencelevel_mapping") - def test_approved_therapeutic(self, mock_get_evidencelevel_mapping, graphkb_conn) -> None: - mock_get_evidencelevel_mapping.return_value = {APPROVED_EVIDENCE_RIDS[0]: "test"} + def test_approved_therapeutic( + self, mock_get_evidencelevel_mapping, graphkb_conn + ) -> None: + mock_get_evidencelevel_mapping.return_value = { + APPROVED_EVIDENCE_RIDS[0]: "test" + } statement = base_graphkb_statement() statement["relevance"]["@rid"] = "therapeutic" diff --git a/tests/test_ipr/test_main.py b/tests/test_ipr/test_main.py index c1cf2b9..a679ae3 100644 --- a/tests/test_ipr/test_main.py +++ b/tests/test_ipr/test_main.py @@ -14,67 +14,70 @@ def get_test_spec(): - ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}} + ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}} ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__ for key in ipr_gene_keys: - ipr_spec['components']['schemas']['genesCreate']['properties'][key] = "" + ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = "" return ipr_spec def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), 'test_data', name) + return os.path.join(os.path.dirname(__file__), "test_data", name) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def report_upload_content(tmp_path_factory) -> Dict: mock = MagicMock() - json_file = tmp_path_factory.mktemp('inputs') / 'content.json' + json_file = tmp_path_factory.mktemp("inputs") / "content.json" json_file.write_text( json.dumps( { - 'blargh': 'some fake content', - 'comparators': [ - {'analysisRole': 'expression (disease)', 'name': '1'}, - {'analysisRole': 'expression (primary site)', 'name': '2'}, - {'analysisRole': 'expression (biopsy site)', 'name': '3'}, - {'analysisRole': 'expression (internal pancancer cohort)', 'name': '4'}, + "blargh": "some fake content", + "comparators": [ + {"analysisRole": "expression (disease)", "name": "1"}, + {"analysisRole": "expression (primary site)", "name": "2"}, + {"analysisRole": "expression (biopsy site)", "name": "3"}, + { + "analysisRole": "expression (internal pancancer cohort)", + "name": "4", + }, ], - 'patientId': 'PATIENT001', - 'project': 'TEST', - 'expressionVariants': pd.read_csv( - get_test_file('expression.short.tab'), sep='\t' - ).to_dict('records'), - 'smallMutations': pd.read_csv( - get_test_file('small_mutations.short.tab'), sep='\t' - ).to_dict('records'), - 'copyVariants': pd.read_csv( - get_test_file('copy_variants.short.tab'), sep='\t' - ).to_dict('records'), - 'structuralVariants': pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_dict( - 'records' - ), - 'kbDiseaseMatch': 'colorectal cancer', + "patientId": "PATIENT001", + "project": "TEST", + "expressionVariants": pd.read_csv( + get_test_file("expression.short.tab"), sep="\t" + ).to_dict("records"), + "smallMutations": pd.read_csv( + get_test_file("small_mutations.short.tab"), sep="\t" + ).to_dict("records"), + "copyVariants": pd.read_csv( + get_test_file("copy_variants.short.tab"), sep="\t" + ).to_dict("records"), + "structuralVariants": pd.read_csv( + get_test_file("fusions.tab"), sep="\t" + ).to_dict("records"), + "kbDiseaseMatch": "colorectal cancer", } ) ) with patch.object( sys, - 'argv', + "argv", [ - 'ipr', - '--username', - os.environ.get('IPR_USER', os.environ['USER']), - '--password', - os.environ['IPR_PASS'], - '--ipr_url', - 'http://fake.url.ca', - '--content', + "ipr", + "--username", + os.environ.get("IPR_USER", os.environ["USER"]), + "--password", + os.environ["IPR_PASS"], + "--ipr_url", + "http://fake.url.ca", + "--content", str(json_file), - '--therapeutics', + "--therapeutics", ], ): - with patch.object(IprConnection, 'upload_report', new=mock): - with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): + with patch.object(IprConnection, "upload_report", new=mock): + with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): command_interface() assert mock.called @@ -83,48 +86,55 @@ def report_upload_content(tmp_path_factory) -> Dict: return report_content -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) class TestCreateReport: def test_main_sections_present(self, report_upload_content: Dict) -> None: sections = set(report_upload_content.keys()) for section in [ - 'structuralVariants', - 'expressionVariants', - 'copyVariants', - 'smallMutations', - 'kbMatches', - 'genes', + "structuralVariants", + "expressionVariants", + "copyVariants", + "smallMutations", + "kbMatches", + "genes", ]: assert section in sections def test_kept_low_quality_fusion(self, report_upload_content: Dict) -> None: - fusions = [(sv['gene1'], sv['gene2']) for sv in report_upload_content['structuralVariants']] - assert ('SARM1', 'SUZ12') in fusions + fusions = [ + (sv["gene1"], sv["gene2"]) + for sv in report_upload_content["structuralVariants"] + ] + assert ("SARM1", "SUZ12") in fusions def test_pass_through_content_added(self, report_upload_content: Dict) -> None: # check the passthorough content was added - assert 'blargh' in report_upload_content + assert "blargh" in report_upload_content def test_found_fusion_partner_gene(self, report_upload_content: Dict) -> None: - genes = report_upload_content['genes'] + genes = report_upload_content["genes"] # eg, A1BG - assert any([g.get('knownFusionPartner', False) for g in genes]) + assert any([g.get("knownFusionPartner", False) for g in genes]) def test_found_oncogene(self, report_upload_content: Dict) -> None: - genes = report_upload_content['genes'] + genes = report_upload_content["genes"] # eg, ZBTB20 - assert any([g.get('oncogene', False) for g in genes]) + assert any([g.get("oncogene", False) for g in genes]) def test_found_tumour_supressor(self, report_upload_content: Dict) -> None: - genes = report_upload_content['genes'] + genes = report_upload_content["genes"] # eg, ZNRF3 - assert any([g.get('tumourSuppressor', False) for g in genes]) + assert any([g.get("tumourSuppressor", False) for g in genes]) def test_found_kb_statement_related_gene(self, report_upload_content: Dict) -> None: - genes = report_upload_content['genes'] - assert any([g.get('kbStatementRelated', False) for g in genes]) - - def test_found_cancer_gene_list_match_gene(self, report_upload_content: Dict) -> None: - genes = report_upload_content['genes'] - assert any([g.get('cancerGeneListMatch', False) for g in genes]) + genes = report_upload_content["genes"] + assert any([g.get("kbStatementRelated", False) for g in genes]) + + def test_found_cancer_gene_list_match_gene( + self, report_upload_content: Dict + ) -> None: + genes = report_upload_content["genes"] + assert any([g.get("cancerGeneListMatch", False) for g in genes]) diff --git a/tests/test_ipr/test_probe.py b/tests/test_ipr/test_probe.py index a7d9cbd..0d70846 100644 --- a/tests/test_ipr/test_probe.py +++ b/tests/test_ipr/test_probe.py @@ -11,33 +11,33 @@ def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), 'test_data', name) + return os.path.join(os.path.dirname(__file__), "test_data", name) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def probe_upload_content() -> Dict: mock = MagicMock() - with patch.object(IprConnection, 'upload_report', new=mock): - with patch.object(IprConnection, 'get_spec', return_value={}): + with patch.object(IprConnection, "upload_report", new=mock): + with patch.object(IprConnection, "get_spec", return_value={}): create_report( content={ - 'patientId': 'PATIENT001', - 'project': 'TEST', - 'smallMutations': pd.read_csv( - get_test_file('small_mutations_probe.tab'), - sep='\t', - dtype={'chromosome': 'string'}, - ).to_dict('records'), - 'structuralVariants': pd.read_csv( - get_test_file('fusions.tab'), sep='\t' - ).to_dict('records'), - 'blargh': 'some fake content', - 'kbDiseaseMatch': 'colorectal cancer', + "patientId": "PATIENT001", + "project": "TEST", + "smallMutations": pd.read_csv( + get_test_file("small_mutations_probe.tab"), + sep="\t", + dtype={"chromosome": "string"}, + ).to_dict("records"), + "structuralVariants": pd.read_csv( + get_test_file("fusions.tab"), sep="\t" + ).to_dict("records"), + "blargh": "some fake content", + "kbDiseaseMatch": "colorectal cancer", }, - username=os.environ['IPR_USER'], - password=os.environ['IPR_PASS'], - log_level='info', - ipr_url='http://fake.url.ca', + username=os.environ["IPR_USER"], + password=os.environ["IPR_PASS"], + log_level="info", + ipr_url="http://fake.url.ca", ) assert mock.called @@ -46,18 +46,22 @@ def probe_upload_content() -> Dict: return report_content -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests") +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" +) class TestCreateReport: def test_found_probe_small_mutations(self, probe_upload_content: Dict) -> None: - assert probe_upload_content['smallMutations'] + assert probe_upload_content["smallMutations"] - def test_found_probe_small_mutations_match(self, probe_upload_content: Dict) -> None: + def test_found_probe_small_mutations_match( + self, probe_upload_content: Dict + ) -> None: # verify each probe had a KB match - for sm_probe in probe_upload_content['smallMutations']: + for sm_probe in probe_upload_content["smallMutations"]: match_list = [ kb_match - for kb_match in probe_upload_content['kbMatches'] - if kb_match['variant'] == sm_probe["key"] + for kb_match in probe_upload_content["kbMatches"] + if kb_match["variant"] == sm_probe["key"] ] assert ( match_list diff --git a/tests/test_ipr/test_summary.py b/tests/test_ipr/test_summary.py index aaee77e..edbcd35 100644 --- a/tests/test_ipr/test_summary.py +++ b/tests/test_ipr/test_summary.py @@ -14,14 +14,24 @@ def test_prefers_non_alias(self): side_effect=[ [], [ - {'sourceId': '1', 'alias': False, 'source': 'source', 'name': 'name'}, - {'sourceId': '2', 'alias': True, 'source': 'source', 'name': 'name'}, + { + "sourceId": "1", + "alias": False, + "source": "source", + "name": "name", + }, + { + "sourceId": "2", + "alias": True, + "source": "source", + "name": "name", + }, ], ] ) ) - rec = get_preferred_drug_representation(api, 'anything') - assert rec['sourceId'] == '1' + rec = get_preferred_drug_representation(api, "anything") + assert rec["sourceId"] == "1" def test_prefers_non_deprecated(self): api = MagicMock( @@ -29,29 +39,49 @@ def test_prefers_non_deprecated(self): side_effect=[ [], [ - {'sourceId': '1', 'deprecated': True, 'source': 'source', 'name': 'name'}, - {'sourceId': '2', 'deprecated': False, 'source': 'source', 'name': 'name'}, + { + "sourceId": "1", + "deprecated": True, + "source": "source", + "name": "name", + }, + { + "sourceId": "2", + "deprecated": False, + "source": "source", + "name": "name", + }, ], ] ) ) - rec = get_preferred_drug_representation(api, 'anything') - assert rec['sourceId'] == '2' + rec = get_preferred_drug_representation(api, "anything") + assert rec["sourceId"] == "2" def test_prefers_lower_sort_source(self): api = MagicMock( query=MagicMock( side_effect=[ - [{'@rid': 'source2', 'sort': 0}, {'@rid': 'source1', 'sort': 1}], + [{"@rid": "source2", "sort": 0}, {"@rid": "source1", "sort": 1}], [ - {'sourceId': '1', 'deprecated': False, 'source': 'source1', 'name': 'name'}, - {'sourceId': '2', 'deprecated': False, 'source': 'source2', 'name': 'name'}, + { + "sourceId": "1", + "deprecated": False, + "source": "source1", + "name": "name", + }, + { + "sourceId": "2", + "deprecated": False, + "source": "source2", + "name": "name", + }, ], ] ) ) - rec = get_preferred_drug_representation(api, 'anything') - assert rec['sourceId'] == '2' + rec = get_preferred_drug_representation(api, "anything") + assert rec["sourceId"] == "2" def test_prefers_newer_version(self): api = MagicMock( @@ -60,46 +90,52 @@ def test_prefers_newer_version(self): [], [ { - 'sourceId': '2', - 'deprecated': True, - 'source': 'source', - 'name': 'name', - 'sourceIdVersion': '1', + "sourceId": "2", + "deprecated": True, + "source": "source", + "name": "name", + "sourceIdVersion": "1", }, { - 'sourceId': '2', - 'deprecated': True, - 'source': 'source', - 'name': 'name', - 'sourceIdVersion': '2', + "sourceId": "2", + "deprecated": True, + "source": "source", + "name": "name", + "sourceIdVersion": "2", }, ], ] ) ) - rec = get_preferred_drug_representation(api, 'anything') - assert rec['sourceIdVersion'] == '1' + rec = get_preferred_drug_representation(api, "anything") + assert rec["sourceIdVersion"] == "1" class TestSubstituteSentenceTemplate: def test_multiple_diseases_no_matches(self): template = "{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})" - relevance = {'displayName': 'senitivity'} - disease_matches = {'1'} + relevance = {"displayName": "senitivity"} + disease_matches = {"1"} diseases = [ - {'@class': 'Disease', '@rid': '2', 'displayName': 'disease 1'}, - {'@class': 'Disease', '@rid': '3', 'displayName': 'disease 2'}, + {"@class": "Disease", "@rid": "2", "displayName": "disease 1"}, + {"@class": "Disease", "@rid": "3", "displayName": "disease 2"}, ] variants = [ { - '@class': 'CategoryVariant', - 'displayName': 'KRAS increased RNA expression', - '@rid': '4', + "@class": "CategoryVariant", + "displayName": "KRAS increased RNA expression", + "@rid": "4", } ] - subjects = [{'@class': 'Therapy', 'displayName': 'some drug', '@rid': '5'}] + subjects = [{"@class": "Therapy", "displayName": "some drug", "@rid": "5"}] sentence = substitute_sentence_template( - template, diseases + variants, subjects, relevance, [], ['6', '7'], disease_matches + template, + diseases + variants, + subjects, + relevance, + [], + ["6", "7"], + disease_matches, ) assert ( sentence @@ -108,23 +144,29 @@ def test_multiple_diseases_no_matches(self): def test_multiple_diseases_some_matches(self): template = "{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})" - relevance = {'displayName': 'senitivity'} - disease_matches = {'1'} + relevance = {"displayName": "senitivity"} + disease_matches = {"1"} diseases = [ - {'@class': 'Disease', '@rid': '2', 'displayName': 'disease 2'}, - {'@class': 'Disease', '@rid': '1', 'displayName': 'disease 1'}, - {'@class': 'Disease', '@rid': '3', 'displayName': 'disease 3'}, + {"@class": "Disease", "@rid": "2", "displayName": "disease 2"}, + {"@class": "Disease", "@rid": "1", "displayName": "disease 1"}, + {"@class": "Disease", "@rid": "3", "displayName": "disease 3"}, ] variants = [ { - '@class': 'CategoryVariant', - 'displayName': 'KRAS increased RNA expression', - '@rid': '4', + "@class": "CategoryVariant", + "displayName": "KRAS increased RNA expression", + "@rid": "4", } ] - subjects = [{'@class': 'Therapy', 'displayName': 'some drug', '@rid': '5'}] + subjects = [{"@class": "Therapy", "displayName": "some drug", "@rid": "5"}] sentence = substitute_sentence_template( - template, diseases + variants, subjects, relevance, [], ['6', '7'], disease_matches + template, + diseases + variants, + subjects, + relevance, + [], + ["6", "7"], + disease_matches, ) assert ( sentence @@ -133,23 +175,29 @@ def test_multiple_diseases_some_matches(self): def test_multiple_diseases_only_matches(self): template = "{conditions:variant} is associated with {relevance} to {subject} in {conditions:disease} ({evidence})" - relevance = {'displayName': 'senitivity'} - disease_matches = {'1', '2', '3'} + relevance = {"displayName": "senitivity"} + disease_matches = {"1", "2", "3"} diseases = [ - {'@class': 'Disease', '@rid': '2', 'displayName': 'disease 2'}, - {'@class': 'Disease', '@rid': '1', 'displayName': 'disease 1'}, - {'@class': 'Disease', '@rid': '3', 'displayName': 'disease 3'}, + {"@class": "Disease", "@rid": "2", "displayName": "disease 2"}, + {"@class": "Disease", "@rid": "1", "displayName": "disease 1"}, + {"@class": "Disease", "@rid": "3", "displayName": "disease 3"}, ] variants = [ { - '@class': 'CategoryVariant', - 'displayName': 'KRAS increased RNA expression', - '@rid': '4', + "@class": "CategoryVariant", + "displayName": "KRAS increased RNA expression", + "@rid": "4", } ] - subjects = [{'@class': 'Therapy', 'displayName': 'some drug', '@rid': '5'}] + subjects = [{"@class": "Therapy", "displayName": "some drug", "@rid": "5"}] sentence = substitute_sentence_template( - template, diseases + variants, subjects, relevance, [], ['6', '7'], disease_matches + template, + diseases + variants, + subjects, + relevance, + [], + ["6", "7"], + disease_matches, ) assert ( sentence diff --git a/tests/test_ipr/test_util.py b/tests/test_ipr/test_util.py index bbae6d9..ffc35a7 100644 --- a/tests/test_ipr/test_util.py +++ b/tests/test_ipr/test_util.py @@ -4,8 +4,13 @@ @pytest.mark.parametrize( - 'input,output_keys', - [[{'key': 0}, ['key']], [{'key': None}, []], [{'key': ''}, []], [{'gene1': None}, ['gene1']]], + "input,output_keys", + [ + [{"key": 0}, ["key"]], + [{"key": None}, []], + [{"key": ""}, []], + [{"gene1": None}, ["gene1"]], + ], ) def test_trim_empty_values(input, output_keys): modified_object = trim_empty_values(input) @@ -13,17 +18,27 @@ def test_trim_empty_values(input, output_keys): @pytest.mark.parametrize( - 'variant,result', + "variant,result", [ [ - {'variantType': 'exp', 'gene': 'GENE', 'expressionState': 'increased expression'}, - 'increased expression', + { + "variantType": "exp", + "gene": "GENE", + "expressionState": "increased expression", + }, + "increased expression", + ], + [ + {"variantType": "cnv", "gene": "GENE", "cnvState": "amplification"}, + "amplification", + ], + [ + {"variantType": "other", "gene2": "GENE", "variant": "GENE:anything"}, + "anything", ], - [{'variantType': 'cnv', 'gene': 'GENE', 'cnvState': 'amplification'}, 'amplification'], - [{'variantType': 'other', 'gene2': 'GENE', 'variant': 'GENE:anything'}, 'anything'], ], ) def test_create_variant_name_tuple(variant, result): gene, name = create_variant_name_tuple(variant) assert name == result - assert gene == 'GENE' + assert gene == "GENE" diff --git a/tests/test_ipr/util.py b/tests/test_ipr/util.py index cb3dd27..25ad657 100644 --- a/tests/test_ipr/util.py +++ b/tests/test_ipr/util.py @@ -5,5 +5,9 @@ def __init__(self, return_values) -> None: def __call__(self, *args, **kwargs): self.index += 1 - ret_val = self.return_values[self.index] if self.index < len(self.return_values) else [] + ret_val = ( + self.return_values[self.index] + if self.index < len(self.return_values) + else [] + ) return ret_val