diff --git a/pori_python/__init__.py b/pori_python/__init__.py index 44df203..71a12b1 100644 --- a/pori_python/__init__.py +++ b/pori_python/__init__.py @@ -1,2 +1 @@ -from . import ipr -from . import graphkb +from . import graphkb, ipr diff --git a/pori_python/graphkb/constants.py b/pori_python/graphkb/constants.py index f1ebf66..9f4b812 100644 --- a/pori_python/graphkb/constants.py +++ b/pori_python/graphkb/constants.py @@ -177,7 +177,7 @@ def __getitem__(self, key): } # For match.type_screening() [KBDEV-1056] -DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = 'mutation' +DEFAULT_NON_STRUCTURAL_VARIANT_TYPE = "mutation" STRUCTURAL_VARIANT_SIZE_THRESHOLD = 48 # bp STRUCTURAL_VARIANT_TYPES = [ "structural variant", diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py index cc71ce6..1edb263 100644 --- a/pori_python/graphkb/genes.py +++ b/pori_python/graphkb/genes.py @@ -24,7 +24,10 @@ def _get_tumourigenesis_genes_list( - conn: GraphKBConnection, relevance: str, sources: List[str], ignore_cache: bool = False + conn: GraphKBConnection, + relevance: str, + sources: List[str], + ignore_cache: bool = False, ) -> List[Ontology]: statements = cast( List[Statement], @@ -34,10 +37,17 @@ def _get_tumourigenesis_genes_list( "filters": { "AND": [ {"source": {"target": "Source", "filters": {"name": sources}}}, - {"relevance": {"target": "Vocabulary", "filters": {"name": relevance}}}, + { + "relevance": { + "target": "Vocabulary", + "filters": {"name": relevance}, + } + }, ] }, - "returnProperties": [f"subject.{prop}" for prop in GENE_RETURN_PROPERTIES], + "returnProperties": [ + f"subject.{prop}" for prop in GENE_RETURN_PROPERTIES + ], }, ignore_cache=ignore_cache, ), @@ -74,7 +84,9 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]: Returns: gene (Feature) records """ - return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, [ONCOKB_SOURCE_NAME]) + return _get_tumourigenesis_genes_list( + conn, TUMOUR_SUPPRESSIVE, 
[ONCOKB_SOURCE_NAME] + ) def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]: @@ -147,7 +159,12 @@ def get_genes_from_variant_types( filters: List[Dict[str, Any]] = [] if types: filters.append( - {"type": {"target": "Vocabulary", "filters": {"name": types, "operator": "IN"}}} + { + "type": { + "target": "Vocabulary", + "filters": {"name": types, "operator": "IN"}, + } + } ) variants = cast( @@ -177,7 +194,11 @@ def get_genes_from_variant_types( result = cast( List[Ontology], conn.query( - {"target": list(genes), "returnProperties": GENE_RETURN_PROPERTIES, "filters": filters}, + { + "target": list(genes), + "returnProperties": GENE_RETURN_PROPERTIES, + "filters": filters, + }, ignore_cache=ignore_cache, ), ) @@ -273,7 +294,12 @@ def get_gene_linked_cancer_predisposition_info( "filters": {"@rid": get_rid(conn, "Source", "CGL")}, } }, - {"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}}, + { + "relevance": { + "target": "Vocabulary", + "filters": {"@rid": relevance_rids}, + } + }, ] }, "returnProperties": [ @@ -307,7 +333,10 @@ def get_gene_linked_cancer_predisposition_info( logger.error( f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}" ) - variants[condition["@rid"]] = [condition["displayName"], assoc_gene_list] + variants[condition["@rid"]] = [ + condition["displayName"], + assoc_gene_list, + ] for gene, name, biotype in infer_genes: logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})") @@ -359,7 +388,12 @@ def get_gene_linked_pharmacogenomic_info( { "target": "Statement", "filters": [ - {"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}} + { + "relevance": { + "target": "Vocabulary", + "filters": {"@rid": relevance_rids}, + } + } ], "returnProperties": [ "conditions.@class", @@ -397,7 +431,10 @@ def get_gene_linked_pharmacogenomic_info( logger.error( f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}" ) - variants[condition["@rid"]] = 
[condition["displayName"], assoc_gene_list] + variants[condition["@rid"]] = [ + condition["displayName"], + assoc_gene_list, + ] for gene, name, biotype in infer_genes: logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})") genes.add(gene) @@ -449,7 +486,9 @@ def get_gene_information( gene_names = sorted(set(gene_names)) statements = graphkb_conn.query(body) - statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS] + statements = [ + s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS + ] gene_flags: Dict[str, Set[str]] = { "kbStatementRelated": set(), @@ -472,9 +511,13 @@ def get_gene_information( logger.info("fetching oncogenes list") gene_flags["oncogene"] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn)) logger.info("fetching tumour supressors list") - gene_flags["tumourSuppressor"] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn)) + gene_flags["tumourSuppressor"] = convert_to_rid_set( + get_oncokb_tumour_supressors(graphkb_conn) + ) logger.info("fetching cancerGeneListMatch list") - gene_flags["cancerGeneListMatch"] = convert_to_rid_set(get_cancer_genes(graphkb_conn)) + gene_flags["cancerGeneListMatch"] = convert_to_rid_set( + get_cancer_genes(graphkb_conn) + ) logger.info("fetching therapeutic associated genes lists") gene_flags["therapeuticAssociated"] = convert_to_rid_set( @@ -484,7 +527,9 @@ def get_gene_information( logger.info(f"Setting gene_info flags on {len(gene_names)} genes") result = [] for gene_name in gene_names: - equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name)) + equivalent = convert_to_rid_set( + get_equivalent_features(graphkb_conn, gene_name) + ) row = {"name": gene_name} flagged = False for flag in gene_flags: diff --git a/pori_python/graphkb/match.py b/pori_python/graphkb/match.py index ec82e57..631d073 100644 --- a/pori_python/graphkb/match.py +++ b/pori_python/graphkb/match.py @@ -15,7 +15,14 @@ STRUCTURAL_VARIANT_TYPES, 
VARIANT_RETURN_PROPERTIES, ) -from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant +from .types import ( + BasicPosition, + Ontology, + ParsedVariant, + PositionalVariant, + Record, + Variant, +) from .util import ( FeatureNotFoundError, convert_to_rid_list, @@ -23,7 +30,7 @@ looks_like_rid, stringifyVariant, ) -from .vocab import get_equivalent_terms, get_terms_set, get_term_tree +from .vocab import get_equivalent_terms, get_term_tree, get_terms_set FEATURES_CACHE: Set[str] = set() @@ -63,7 +70,8 @@ def get_equivalent_features( return cast( List[Ontology], conn.query( - {"target": [gene_name], "queryType": "similarTo"}, ignore_cache=ignore_cache + {"target": [gene_name], "queryType": "similarTo"}, + ignore_cache=ignore_cache, ), ) @@ -82,9 +90,16 @@ def get_equivalent_features( filters.append({"sourceId": gene_name}) if source_id_version: filters.append( - {"OR": [{"sourceIdVersion": source_id_version}, {"sourceIdVersion": None}]} + { + "OR": [ + {"sourceIdVersion": source_id_version}, + {"sourceIdVersion": None}, + ] + } ) - elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache: + elif ( + FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache + ): return [] else: filters.append({"OR": [{"sourceId": gene_name}, {"name": gene_name}]}) @@ -92,7 +107,10 @@ def get_equivalent_features( return cast( List[Ontology], conn.query( - {"target": {"target": "Feature", "filters": filters}, "queryType": "similarTo"}, + { + "target": {"target": "Feature", "filters": filters}, + "queryType": "similarTo", + }, ignore_cache=ignore_cache, ), ) @@ -105,7 +123,13 @@ def cache_missing_features(conn: GraphKBConnection) -> None: """ genes = cast( List[Ontology], - conn.query({"target": "Feature", "returnProperties": ["name", "sourceId"], "neighbors": 0}), + conn.query( + { + "target": "Feature", + "returnProperties": ["name", "sourceId"], + "neighbors": 0, + } + ), ) for gene in genes: if 
gene["name"]: @@ -160,7 +184,9 @@ def match_category_variant( ) if not terms: - raise ValueError(f"unable to find the term/category ({category}) or any equivalent") + raise ValueError( + f"unable to find the term/category ({category}) or any equivalent" + ) # find the variant list return cast( @@ -175,7 +201,12 @@ def match_category_variant( ], }, "queryType": "similarTo", - "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"], + "edges": [ + "AliasOf", + "DeprecatedBy", + "CrossReferenceOf", + "GeneralizationOf", + ], "treeEdges": ["Infers"], "returnProperties": VARIANT_RETURN_PROPERTIES, }, @@ -185,7 +216,11 @@ def match_category_variant( def match_copy_variant( - conn: GraphKBConnection, gene_name: str, category: str, drop_homozygous: bool = False, **kwargs + conn: GraphKBConnection, + gene_name: str, + category: str, + drop_homozygous: bool = False, + **kwargs, ) -> List[Variant]: """ Returns a list of variants matching the input variant @@ -226,7 +261,9 @@ def match_expression_variant( def positions_overlap( - pos_record: BasicPosition, range_start: BasicPosition, range_end: Optional[BasicPosition] = None + pos_record: BasicPosition, + range_start: BasicPosition, + range_end: Optional[BasicPosition] = None, ) -> bool: """ Check if 2 Position records from GraphKB indicate an overlap @@ -350,9 +387,14 @@ def compare_positional_variants( reference_variant["untemplatedSeq"] not in AMBIGUOUS_AA and variant["untemplatedSeq"] not in AMBIGUOUS_AA ): - if reference_variant["untemplatedSeq"].lower() != variant["untemplatedSeq"].lower(): + if ( + reference_variant["untemplatedSeq"].lower() + != variant["untemplatedSeq"].lower() + ): return False - elif len(variant["untemplatedSeq"]) != len(reference_variant["untemplatedSeq"]): + elif len(variant["untemplatedSeq"]) != len( + reference_variant["untemplatedSeq"] + ): return False # If both variants have a reference sequence, @@ -374,9 +416,7 @@ def compare_positional_variants( def type_screening( - 
conn: GraphKBConnection, - parsed: ParsedVariant, - updateStructuralTypes=False, + conn: GraphKBConnection, parsed: ParsedVariant, updateStructuralTypes=False ) -> str: """ [KBDEV-1056] @@ -424,40 +464,42 @@ def type_screening( # Will use either hardcoded type list or an updated list from the API if updateStructuralTypes: - rids = list(get_terms_set(conn, ['structural variant'])) + rids = list(get_terms_set(conn, ["structural variant"])) records = conn.get_records_by_id(rids) - structuralVariantTypes = [el['name'] for el in records] + structuralVariantTypes = [el["name"] for el in records] # Unambiguous non-structural variation type - if parsed['type'] not in structuralVariantTypes: - return parsed['type'] + if parsed["type"] not in structuralVariantTypes: + return parsed["type"] # Unambiguous structural variation type - if parsed['type'] in ['fusion', 'translocation']: - return parsed['type'] - if parsed.get('reference2', None): - return parsed['type'] - prefix = parsed.get('prefix', 'g') - if prefix == 'y': # Assuming all variations using cytoband coordiantes meet the size threshold - return parsed['type'] + if parsed["type"] in ["fusion", "translocation"]: + return parsed["type"] + if parsed.get("reference2", None): + return parsed["type"] + prefix = parsed.get("prefix", "g") + if ( + prefix == "y" + ): # Assuming all variations using cytoband coordiantes meet the size threshold + return parsed["type"] # When size cannot be determined: exonic and intronic coordinates # e.g. 
"MET:e.14del" meaning "Any deletion occuring at the 14th exon" - if prefix in ['e', 'i']: # Assuming they don't meet the size threshold + if prefix in ["e", "i"]: # Assuming they don't meet the size threshold return default_type # When size is given - if parsed.get('untemplatedSeqSize', 0) >= threshold: - return parsed['type'] + if parsed.get("untemplatedSeqSize", 0) >= threshold: + return parsed["type"] # When size needs to be computed from positions - pos_start = parsed.get('break1Start', {}).get('pos', 1) - pos_end = parsed.get('break2Start', {}).get('pos', pos_start) + pos_start = parsed.get("break1Start", {}).get("pos", 1) + pos_end = parsed.get("break2Start", {}).get("pos", pos_start) pos_size = 1 - if prefix == 'p': + if prefix == "p": pos_size = 3 if ((pos_end - pos_start) + 1) * pos_size >= threshold: - return parsed['type'] + return parsed["type"] # Default return default_type @@ -533,7 +575,11 @@ def match_positional_variant( gene1 = parsed["reference1"] gene1_features = get_equivalent_features( - conn, gene1, source=gene_source, is_source_id=gene_is_source_id, ignore_cache=ignore_cache + conn, + gene1, + source=gene_source, + is_source_id=gene_is_source_id, + ignore_cache=ignore_cache, ) features = convert_to_rid_list(gene1_features) @@ -584,12 +630,15 @@ def match_positional_variant( ] filtered_similarOnly: List[Record] = [] # For post filter match use - filtered_similarAndGeneric: List[Record] = [] # To be added to the matches at the very end + filtered_similarAndGeneric: List[Record] = ( + [] + ) # To be added to the matches at the very end for row in cast( List[Record], conn.query( - {"target": "PositionalVariant", "filters": query_filters}, ignore_cache=ignore_cache + {"target": "PositionalVariant", "filters": query_filters}, + ignore_cache=ignore_cache, ), ): # TODO: Check if variant and reference_variant should be interchanged @@ -612,7 +661,12 @@ def match_positional_variant( { "target": convert_to_rid_list(filtered_similarOnly), "queryType": 
"similarTo", - "edges": ["AliasOf", "DeprecatedBy", "CrossReferenceOf", "GeneralizationOf"], + "edges": [ + "AliasOf", + "DeprecatedBy", + "CrossReferenceOf", + "GeneralizationOf", + ], "treeEdges": ["Infers"], "returnProperties": POS_VARIANT_RETURN_PROPERTIES, }, diff --git a/pori_python/graphkb/statement.py b/pori_python/graphkb/statement.py index c969e8f..032498b 100644 --- a/pori_python/graphkb/statement.py +++ b/pori_python/graphkb/statement.py @@ -1,7 +1,11 @@ from typing import List, cast from . import GraphKBConnection -from .constants import FAILED_REVIEW_STATUS, RELEVANCE_BASE_TERMS, STATEMENT_RETURN_PROPERTIES +from .constants import ( + FAILED_REVIEW_STATUS, + RELEVANCE_BASE_TERMS, + STATEMENT_RETURN_PROPERTIES, +) from .types import CategoryBaseTermMapping, Statement, Variant from .util import convert_to_rid_list from .vocab import get_terms_set @@ -23,7 +27,9 @@ def categorize_relevance( def get_statements_from_variants( - graphkb_conn: GraphKBConnection, variants: List[Variant], failed_review: bool = False + graphkb_conn: GraphKBConnection, + variants: List[Variant], + failed_review: bool = False, ) -> List[Statement]: """Given a list of variant records from GraphKB, return related statements. 
@@ -38,10 +44,15 @@ def get_statements_from_variants( statements = graphkb_conn.query( { "target": "Statement", - "filters": {"conditions": convert_to_rid_list(variants), "operator": "CONTAINSANY"}, + "filters": { + "conditions": convert_to_rid_list(variants), + "operator": "CONTAINSANY", + }, "returnProperties": STATEMENT_RETURN_PROPERTIES, } ) if not failed_review: - statements = [s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS] + statements = [ + s for s in statements if s.get("reviewStatus") != FAILED_REVIEW_STATUS + ] return [cast(Statement, s) for s in statements] diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py index 7c6ef94..64b82e0 100644 --- a/pori_python/graphkb/util.py +++ b/pori_python/graphkb/util.py @@ -1,3 +1,7 @@ +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + import hashlib import json import logging @@ -6,10 +10,6 @@ from datetime import datetime from typing import Any, Dict, Iterable, List, Optional, Union, cast -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - from .constants import DEFAULT_LIMIT, DEFAULT_URL, TYPES_TO_NOTATION, AA_3to1_MAPPING from .types import OntologyTerm, ParsedVariant, PositionalVariant, Record @@ -113,7 +113,10 @@ def __init__( self.url = url self.username = username self.password = password - self.headers = {"Accept": "application/json", "Content-Type": "application/json"} + self.headers = { + "Accept": "application/json", + "Content-Type": "application/json", + } self.cache: Dict[Any, Any] = {} if not use_global_cache else QUERY_CACHE self.request_count = 0 self.first_request: Optional[datetime] = None @@ -125,7 +128,9 @@ def __init__( def load(self) -> Optional[float]: if self.first_request and self.last_request: return ( - self.request_count * 1000 / millis_interval(self.first_request, self.last_request) + self.request_count + * 1000 + / 
millis_interval(self.first_request, self.last_request) ) return None @@ -266,7 +271,9 @@ def query( return self.cache[hash_code] while True: - content = self.post("query", data={**request_body, "limit": limit, "skip": len(result)}) + content = self.post( + "query", data={**request_body, "limit": limit, "skip": len(result)} + ) records = content["result"] result.extend(records) if len(records) < limit or not paginate: @@ -358,7 +365,9 @@ def stripRefSeq(breakRepr: str) -> str: return breakRepr -def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = True) -> str: +def stripDisplayName( + displayName: str, withRef: bool = True, withRefSeq: bool = True +) -> str: match: object = re.search(r"^(.*)(\:)(.*)$", displayName) if match and not withRef: if withRefSeq: @@ -376,7 +385,9 @@ def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = while new_matches: new_matches = re.search(r"(.*)([A-Z]|\?)([0-9]+)(.*)", rest) if new_matches: - rest = new_matches.group(1) + new_matches.group(3) + new_matches.group(4) + rest = ( + new_matches.group(1) + new_matches.group(3) + new_matches.group(4) + ) # refSeq before '>' new_matches = re.search(r"^([0-9]*)([A-Z]*|\?)(\>)(.*)$", rest) @@ -392,7 +403,9 @@ def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = def stringifyVariant( - variant: Union[PositionalVariant, ParsedVariant], withRef: bool = True, withRefSeq: bool = True + variant: Union[PositionalVariant, ParsedVariant], + withRef: bool = True, + withRefSeq: bool = True, ) -> str: """ Convert variant record to a string representation (displayName/hgvs) @@ -458,8 +471,12 @@ def stringifyVariant( break2Repr_noParentheses = stripParentheses(break2Repr) result.append(f"({break1Repr_noParentheses},{break2Repr_noParentheses})") else: - break1Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break1Repr)) - break2Repr_noParentheses_noRefSeq = stripRefSeq(stripParentheses(break2Repr)) + 
break1Repr_noParentheses_noRefSeq = stripRefSeq( + stripParentheses(break1Repr) + ) + break2Repr_noParentheses_noRefSeq = stripRefSeq( + stripParentheses(break2Repr) + ) result.append( f"({break1Repr_noParentheses_noRefSeq},{break2Repr_noParentheses_noRefSeq})" ) diff --git a/pori_python/graphkb/vocab.py b/pori_python/graphkb/vocab.py index 51446db..1b6c609 100644 --- a/pori_python/graphkb/vocab.py +++ b/pori_python/graphkb/vocab.py @@ -24,7 +24,9 @@ def get_equivalent_terms( base_term_name: the name to get superclasses of root_exclude_term: the parent term to exlcude along with all of its parent terms """ - base_records = convert_to_rid_list(conn.query(build_base_query(ontology_class, base_term_name))) + base_records = convert_to_rid_list( + conn.query(build_base_query(ontology_class, base_term_name)) + ) if not base_records: return [] base_term_parents = cast( @@ -34,7 +36,13 @@ def get_equivalent_terms( "target": {"target": base_records, "queryType": "descendants"}, "queryType": "similarTo", "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + "returnProperties": [ + "sourceId", + "sourceIdVersion", + "deprecated", + "name", + "@rid", + ], }, ignore_cache=ignore_cache, ), @@ -94,7 +102,9 @@ def get_term_tree( Note: this must be done in 2 calls to avoid going up and down the tree in a single query (exclude adjacent siblings) """ # get all child terms of the subclass tree and disambiguate them - base_records = convert_to_rid_list(conn.query(build_base_query(ontology_class, base_term_name))) + base_records = convert_to_rid_list( + conn.query(build_base_query(ontology_class, base_term_name)) + ) if not base_records: return [] child_terms = cast( @@ -104,7 +114,13 @@ def get_term_tree( "target": {"target": base_records, "queryType": "ancestors"}, "queryType": "similarTo", "treeEdges": [], - "returnProperties": ["sourceId", "sourceIdVersion", "deprecated", "name", "@rid"], + "returnProperties": [ + "sourceId", + 
"sourceIdVersion", + "deprecated", + "name", + "@rid", + ], }, ignore_cache=ignore_cache, ), @@ -176,7 +192,9 @@ def get_term_by_name( def get_terms_set( - graphkb_conn: GraphKBConnection, base_terms: Iterable[str], ignore_cache: bool = False + graphkb_conn: GraphKBConnection, + base_terms: Iterable[str], + ignore_cache: bool = False, ) -> Set[str]: """Get a set of vocabulary rids given some base/parent term names.""" base_terms = [base_terms] if isinstance(base_terms, str) else base_terms @@ -188,7 +206,10 @@ def get_terms_set( terms.update( convert_to_rid_list( get_term_tree( - graphkb_conn, base_term, include_superclasses=False, ignore_cache=ignore_cache + graphkb_conn, + base_term, + include_superclasses=False, + ignore_cache=ignore_cache, ) ) ) diff --git a/pori_python/ipr/annotate.py b/pori_python/ipr/annotate.py index a7f20d3..92aff40 100644 --- a/pori_python/ipr/annotate.py +++ b/pori_python/ipr/annotate.py @@ -4,19 +4,26 @@ from requests.exceptions import HTTPError +from pandas import isnull +from tqdm import tqdm +from typing import Dict, List, Sequence + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb import match as gkb_match from pori_python.graphkb.match import INPUT_COPY_CATEGORIES from pori_python.graphkb.statement import get_statements_from_variants from pori_python.graphkb.types import Variant from pori_python.graphkb.util import FeatureNotFoundError -from pandas import isnull -from tqdm import tqdm -from typing import Dict, List, Sequence from .constants import TMB_HIGH_CATEGORY from .ipr import convert_statements_to_alterations -from .types import GkbStatement, IprCopyVariant, IprExprVariant, IprStructuralVariant, KbMatch +from .types import ( + GkbStatement, + IprCopyVariant, + IprExprVariant, + IprStructuralVariant, + KbMatch, +) from .util import Hashabledict, convert_to_rid_set, logger REPORTED_COPY_VARIANTS = (INPUT_COPY_CATEGORIES.AMP, INPUT_COPY_CATEGORIES.DEEP) @@ -31,16 +38,18 @@ def get_second_pass_variants( 
# second-pass matching all_inferred_matches: Dict[str, Variant] = {} inferred_variants = { - (s['subject']['@rid'], s['relevance']['name']) + (s["subject"]["@rid"], s["relevance"]["name"]) for s in statements - if s['subject'] and s['subject']['@class'] in ('Feature', 'Signature') + if s["subject"] and s["subject"]["@class"] in ("Feature", "Signature") } for reference1, variant_type in inferred_variants: - variants = gkb_match.match_category_variant(graphkb_conn, reference1, variant_type) + variants = gkb_match.match_category_variant( + graphkb_conn, reference1, variant_type + ) for variant in variants: - all_inferred_matches[variant['@rid']] = variant + all_inferred_matches[variant["@rid"]] = variant inferred_matches: List[Variant] = list(all_inferred_matches.values()) return inferred_matches @@ -57,7 +66,7 @@ def get_ipr_statements_from_variants( return [] rows = [] statements = get_statements_from_variants(graphkb_conn, matches) - existing_statements = {s['@rid'] for s in statements} + existing_statements = {s["@rid"] for s in statements} for ipr_row in convert_statements_to_alterations( graphkb_conn, statements, disease_name, convert_to_rid_set(matches) @@ -70,13 +79,17 @@ def get_ipr_statements_from_variants( inferred_statements = [ s for s in get_statements_from_variants(graphkb_conn, inferred_matches) - if s['@rid'] not in existing_statements # do not duplicate if non-inferred match + if s["@rid"] + not in existing_statements # do not duplicate if non-inferred match ] for ipr_row in convert_statements_to_alterations( - graphkb_conn, inferred_statements, disease_name, convert_to_rid_set(inferred_matches) + graphkb_conn, + inferred_statements, + disease_name, + convert_to_rid_set(inferred_matches), ): - ipr_row['kbData']['inferred'] = True + ipr_row["kbData"]["inferred"] = True rows.append(ipr_row) return rows @@ -104,8 +117,8 @@ def annotate_expression_variants( logger.info(f"Starting annotation of {len(variants)} expression category_variants") iterfunc = 
tqdm if show_progress else iter for row in iterfunc(variants): - gene = row['gene'] - variant = row['variant'] + gene = row["gene"] + variant = row["variant"] if not variant: skipped += 1 @@ -114,23 +127,25 @@ def annotate_expression_variants( try: matches = gkb_match.match_expression_variant(graphkb_conn, gene, variant) - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name): - ipr_row['variant'] = row['key'] - ipr_row['variantType'] = row.get('variantType', 'exp') + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, matches, disease_name + ): + ipr_row["variant"] = row["key"] + ipr_row["variantType"] = row.get("variantType", "exp") alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') + logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") except ValueError as err: - logger.error(f'failed to match variants ({gene} {variant}): {err}') + logger.error(f"failed to match variants ({gene} {variant}): {err}") if skipped: - logger.info(f'skipped matching {skipped} expression information rows') + logger.info(f"skipped matching {skipped} expression information rows") if problem_genes: - logger.error(f'gene finding failures for expression {sorted(problem_genes)}') - logger.error(f'gene finding falure for {len(problem_genes)} expression genes') + logger.error(f"gene finding failures for expression {sorted(problem_genes)}") + logger.error(f"gene finding falure for {len(problem_genes)} expression genes") logger.info( - f'matched {len(variants)} expression variants to {len(alterations)} graphkb annotations' + f"matched {len(variants)} expression variants to {len(alterations)} graphkb annotations" ) return alterations @@ -157,36 +172,42 @@ def annotate_copy_variants( logger.info(f"Starting annotation of {len(variants)} copy category_variants") iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - gene = 
row['gene'] - variant = row['variant'] + gene = row["gene"] + variant = row["variant"] if variant not in REPORTED_COPY_VARIANTS: # https://www.bcgsc.ca/jira/browse/GERO-77 skipped += 1 - logger.debug(f"Dropping {gene} copy change '{variant}' - not in REPORTED_COPY_VARIANTS") + logger.debug( + f"Dropping {gene} copy change '{variant}' - not in REPORTED_COPY_VARIANTS" + ) continue try: matches = gkb_match.match_copy_variant(graphkb_conn, gene, variant) - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name): - ipr_row['variant'] = row['key'] - ipr_row['variantType'] = row.get('variantType', 'cnv') + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, matches, disease_name + ): + ipr_row["variant"] = row["key"] + ipr_row["variantType"] = row.get("variantType", "cnv") alterations.append(ipr_row) except FeatureNotFoundError as err: problem_genes.add(gene) - logger.debug(f'Unrecognized gene ({gene} {variant}): {err}') + logger.debug(f"Unrecognized gene ({gene} {variant}): {err}") except ValueError as err: - logger.error(f'failed to match variants ({gene} {variant}): {err}') + logger.error(f"failed to match variants ({gene} {variant}): {err}") if skipped: logger.info( - f'skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}' + f"skipped matching {skipped} copy number variants not in {REPORTED_COPY_VARIANTS}" ) if problem_genes: - logger.error(f'gene finding failures for copy variants {sorted(problem_genes)}') - logger.error(f'gene finding failure for {len(problem_genes)} copy variant genes') + logger.error(f"gene finding failures for copy variants {sorted(problem_genes)}") + logger.error( + f"gene finding failure for {len(problem_genes)} copy variant genes" + ) logger.info( - f'matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations' + f"matched {len(variants)} copy category variants to {len(alterations)} graphkb annotations" ) return alterations @@ -208,14 +229,14 @@ 
def annotate_positional_variants( Returns: list of kbMatches records for IPR """ - VARIANT_KEYS = ('variant', 'hgvsProtein', 'hgvsCds', 'hgvsGenomic') + VARIANT_KEYS = ("variant", "hgvsProtein", "hgvsCds", "hgvsGenomic") errors = 0 alterations = [] problem_genes = set() iterfunc = tqdm if show_progress else iter for row in iterfunc(variants): - if not row.get('gene') and (not row.get('gene1') or not row.get('gene2')): + if not row.get("gene") and (not row.get("gene1") or not row.get("gene2")): # https://www.bcgsc.ca/jira/browse/GERO-56?focusedCommentId=1234791&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1234791 # should not match single gene SVs continue @@ -232,58 +253,62 @@ def annotate_positional_variants( # DEVSU-1885 - fix malformed single deletion described as substitution of blank # eg. deletion described as substitution with nothing: 'chr1:g.150951027T>' if ( - variant[-1] == '>' - and 'g.' in variant + variant[-1] == ">" + and "g." in variant and variant[-2].isalpha() and variant[-3].isnumeric() ): logger.warning( f"Assuming malformed deletion variant {variant} is {variant[:-2] + 'del'}" ) - variant = variant[:-2] + 'del' - matches = gkb_match.match_positional_variant(graphkb_conn, variant) + variant = variant[:-2] + "del" + matches = gkb_match.match_positional_variant( + graphkb_conn, variant + ) else: raise parse_err for ipr_row in get_ipr_statements_from_variants( graphkb_conn, matches, disease_name ): - ipr_row['variant'] = row['key'] - ipr_row['variantType'] = row.get( - 'variantType', 'mut' if row.get('gene') else 'sv' + ipr_row["variant"] = row["key"] + ipr_row["variantType"] = row.get( + "variantType", "mut" if row.get("gene") else "sv" ) alterations.append(Hashabledict(ipr_row)) except FeatureNotFoundError as err: - logger.debug(f'failed to match positional variants ({variant}): {err}') + logger.debug(f"failed to match positional variants ({variant}): {err}") errors += 1 - if 'gene' in row: - 
problem_genes.add(row['gene']) - elif 'gene1' in row and f"({row['gene1']})" in str(err): - problem_genes.add(row['gene1']) - elif 'gene2' in row and f"({row['gene2']})" in str(err): - problem_genes.add(row['gene2']) - elif 'gene1' in row and 'gene2' in row: - problem_genes.add(row['gene1']) - problem_genes.add(row['gene2']) + if "gene" in row: + problem_genes.add(row["gene"]) + elif "gene1" in row and f"({row['gene1']})" in str(err): + problem_genes.add(row["gene1"]) + elif "gene2" in row and f"({row['gene2']})" in str(err): + problem_genes.add(row["gene2"]) + elif "gene1" in row and "gene2" in row: + problem_genes.add(row["gene1"]) + problem_genes.add(row["gene2"]) else: raise err except HTTPError as err: errors += 1 - logger.error(f'failed to match positional variants ({variant}): {err}') + logger.error(f"failed to match positional variants ({variant}): {err}") if problem_genes: - logger.error(f'gene finding failures for {sorted(problem_genes)}') - logger.error(f'{len(problem_genes)} gene finding failures for positional variants') + logger.error(f"gene finding failures for {sorted(problem_genes)}") + logger.error( + f"{len(problem_genes)} gene finding failures for positional variants" + ) if errors: - logger.error(f'skipped {errors} positional variants due to errors') + logger.error(f"skipped {errors} positional variants due to errors") # drop duplicates alterations: List[KbMatch] = list(set(alterations)) - variant_types = ", ".join(sorted(set([alt['variantType'] for alt in alterations]))) + variant_types = ", ".join(sorted(set([alt["variantType"] for alt in alterations]))) logger.info( - f'matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations' + f"matched {len(variants)} {variant_types} positional variants to {len(alterations)} graphkb annotations" ) return alterations @@ -291,8 +316,8 @@ def annotate_positional_variants( def annotate_msi( graphkb_conn: GraphKBConnection, - disease_name: str = 'cancer', - 
msi_category: str = 'microsatellite instability', + disease_name: str = "cancer", + msi_category: str = "microsatellite instability", ) -> List[KbMatch]: """Annotate microsatellite instablity from GraphKB in the IPR alterations format. @@ -307,26 +332,33 @@ def annotate_msi( gkb_matches = [] msi_categories = graphkb_conn.query( { - 'target': { - 'target': 'CategoryVariant', - 'filters': { - 'reference1': {'target': 'Signature', 'filters': {'name': msi_category}} + "target": { + "target": "CategoryVariant", + "filters": { + "reference1": { + "target": "Signature", + "filters": {"name": msi_category}, + } }, }, - 'queryType': 'similarTo', - 'returnProperties': ['@rid', 'displayName'], + "queryType": "similarTo", + "returnProperties": ["@rid", "displayName"], } ) if msi_categories: - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, msi_categories, disease_name): - ipr_row['variant'] = msi_category - ipr_row['variantType'] = 'msi' + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, msi_categories, disease_name + ): + ipr_row["variant"] = msi_category + ipr_row["variantType"] = "msi" gkb_matches.append(ipr_row) return gkb_matches def annotate_tmb( - graphkb_conn: GraphKBConnection, disease_name: str = 'cancer', category: str = TMB_HIGH_CATEGORY + graphkb_conn: GraphKBConnection, + disease_name: str = "cancer", + category: str = TMB_HIGH_CATEGORY, ) -> List[KbMatch]: """Annotate Tumour Mutation Burden (tmb) categories from GraphKB in the IPR alterations format. 
@@ -342,22 +374,26 @@ def annotate_tmb( gkb_matches = [] categories = graphkb_conn.query( { - 'target': { - 'target': 'CategoryVariant', - 'filters': { - 'reference1': { - 'target': 'Signature', - 'filters': {'OR': [{'name': category}, {'displayName': category}]}, + "target": { + "target": "CategoryVariant", + "filters": { + "reference1": { + "target": "Signature", + "filters": { + "OR": [{"name": category}, {"displayName": category}] + }, } }, }, - 'queryType': 'similarTo', - 'returnProperties': ['@rid', 'displayName'], + "queryType": "similarTo", + "returnProperties": ["@rid", "displayName"], } ) if categories: - for ipr_row in get_ipr_statements_from_variants(graphkb_conn, categories, disease_name): - ipr_row['variant'] = category - ipr_row['variantType'] = 'tmb' + for ipr_row in get_ipr_statements_from_variants( + graphkb_conn, categories, disease_name + ): + ipr_row["variant"] = category + ipr_row["variantType"] = "tmb" gkb_matches.append(ipr_row) return gkb_matches diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py index 5f4e846..122c047 100644 --- a/pori_python/ipr/connection.py +++ b/pori_python/ipr/connection.py @@ -2,9 +2,10 @@ import json import os +import time import zlib from typing import Dict, List -import time + from .constants import DEFAULT_URL from .util import logger @@ -137,7 +138,9 @@ def set_analyst_comments(self, report_id: str, data: Dict) -> Dict: data=zlib.compress(json.dumps(data, allow_nan=False).encode("utf-8")), ) - def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str] = {}) -> None: + def post_images( + self, report_id: str, files: Dict[str, str], data: Dict[str, str] = {} + ) -> None: """ Post images to the report """ @@ -168,7 +171,9 @@ def post_images(self, report_id: str, files: Dict[str, str], data: Dict[str, str handler.close() start_index += IMAGE_MAX if image_errors: - raise ValueError(f'Error uploading images ({", ".join(sorted(list(image_errors)))})') + raise ValueError( + 
f'Error uploading images ({", ".join(sorted(list(image_errors)))})' + ) def get_spec(self) -> Dict: """ diff --git a/pori_python/ipr/constants.py b/pori_python/ipr/constants.py index 493d620..948abc9 100644 --- a/pori_python/ipr/constants.py +++ b/pori_python/ipr/constants.py @@ -1,9 +1,17 @@ -DEFAULT_URL = 'https://iprstaging-api.bcgsc.ca/api' -GERMLINE_BASE_TERMS = ('pharmacogenomic', 'cancer predisposition') # based on graphkb.constants -VARIANT_CLASSES = {'Variant', 'CategoryVariant', 'PositionalVariant', 'CatalogueVariant'} +DEFAULT_URL = "https://iprstaging-api.bcgsc.ca/api" +GERMLINE_BASE_TERMS = ( + "pharmacogenomic", + "cancer predisposition", +) # based on graphkb.constants +VARIANT_CLASSES = { + "Variant", + "CategoryVariant", + "PositionalVariant", + "CatalogueVariant", +} # all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial'] -FAILED_REVIEW_STATUS = 'failed' +FAILED_REVIEW_STATUS = "failed" TMB_HIGH = 10.0 # genomic mutations per mb - https://www.bcgsc.ca/jira/browse/GERO-296 -TMB_HIGH_CATEGORY = 'high mutation burden' +TMB_HIGH_CATEGORY = "high mutation burden" diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index c18608e..628428a 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -7,9 +7,10 @@ import os import pandas as pd from Bio.Data.IUPACData import protein_letters_3to1 -from pori_python.graphkb.match import INPUT_COPY_CATEGORIES, INPUT_EXPRESSION_CATEGORIES from typing import Callable, Dict, Iterable, List, Set, Tuple, cast +from pori_python.graphkb.match import INPUT_COPY_CATEGORIES, INPUT_EXPRESSION_CATEGORIES + from .types import ( IprCopyVariant, IprExprVariant, @@ -19,133 +20,133 @@ ) from .util import hash_key, logger, pandas_falsy -protein_letters_3to1.setdefault('Ter', '*') +protein_letters_3to1.setdefault("Ter", "*") -SPECIFICATION = os.path.join(os.path.dirname(__file__), 'content.spec.json') +SPECIFICATION = 
os.path.join(os.path.dirname(__file__), "content.spec.json") # content in the local specification should match the values in IPR_API_SPEC_JSON_URL -IPR_API_SPEC_JSON_URL = 'https://ipr-api.bcgsc.ca/api/spec.json' +IPR_API_SPEC_JSON_URL = "https://ipr-api.bcgsc.ca/api/spec.json" # TODO: GERO-307 - use SPECIFICATION json to derive the variant required and optional details defined below # 'cnvState' is for display -COPY_REQ = ['gene', 'kbCategory'] -COPY_KEY = ['gene'] +COPY_REQ = ["gene", "kbCategory"] +COPY_KEY = ["gene"] COPY_OPTIONAL = [ - 'cnvState', - 'copyChange', - 'lohState', # Loss of Heterzygosity state - informative detail to analyst - 'chromosomeBand', - 'start', - 'end', - 'size', - 'log2Cna', - 'cna', - 'comments', - 'library', - 'germline', + "cnvState", + "copyChange", + "lohState", # Loss of Heterozygosity state - informative detail to analyst + "chromosomeBand", + "start", + "end", + "size", + "log2Cna", + "cna", + "comments", + "library", + "germline", ] -SMALL_MUT_REQ = ['gene', 'proteinChange'] +SMALL_MUT_REQ = ["gene", "proteinChange"] # alternate details in the key, can distinguish / subtype events. 
SMALL_MUT_KEY = SMALL_MUT_REQ + [ - 'altSeq', - 'chromosome', - 'endPosition', - 'refSeq', - 'startPosition', - 'transcript', + "altSeq", + "chromosome", + "endPosition", + "refSeq", + "startPosition", + "transcript", ] SMALL_MUT_OPTIONAL = [ - 'altSeq', - 'comments', - 'chromosome', - 'endPosition', - 'germline', - 'hgvsCds', - 'hgvsGenomic', - 'hgvsProtein', - 'library', - 'ncbiBuild', - 'normalAltCount', - 'normalDepth', - 'normalRefCount', - 'refSeq', - 'rnaAltCount', - 'rnaDepth', - 'rnaRefCount', - 'startPosition', - 'transcript', - 'tumourAltCount', - 'tumourAltCopies', - 'tumourDepth', - 'tumourRefCount', - 'tumourRefCopies', - 'zygosity', + "altSeq", + "comments", + "chromosome", + "endPosition", + "germline", + "hgvsCds", + "hgvsGenomic", + "hgvsProtein", + "library", + "ncbiBuild", + "normalAltCount", + "normalDepth", + "normalRefCount", + "refSeq", + "rnaAltCount", + "rnaDepth", + "rnaRefCount", + "startPosition", + "transcript", + "tumourAltCount", + "tumourAltCopies", + "tumourDepth", + "tumourRefCount", + "tumourRefCopies", + "zygosity", ] -EXP_REQ = ['gene', 'kbCategory'] -EXP_KEY = ['gene'] +EXP_REQ = ["gene", "kbCategory"] +EXP_KEY = ["gene"] EXP_OPTIONAL = [ - 'biopsySiteFoldChange', - 'biopsySitePercentile', - 'biopsySiteQC', - 'biopsySiteZScore', - 'biopsySitekIQR', - 'comments', - 'diseaseFoldChange', - 'diseasekIQR', - 'diseasePercentile', - 'diseaseQC', - 'diseaseZScore', - 'expressionState', - 'histogramImage', - 'library', - 'primarySiteFoldChange', - 'primarySitekIQR', - 'primarySitePercentile', - 'primarySiteQC', - 'primarySiteZScore', - 'internalPancancerFoldChange', - 'internalPancancerkIQR', - 'internalPancancerPercentile', - 'internalPancancerQC', - 'internalPancancerZScore', - 'rnaReads', - 'rpkm', - 'tpm', + "biopsySiteFoldChange", + "biopsySitePercentile", + "biopsySiteQC", + "biopsySiteZScore", + "biopsySitekIQR", + "comments", + "diseaseFoldChange", + "diseasekIQR", + "diseasePercentile", + "diseaseQC", + "diseaseZScore", + 
"expressionState", + "histogramImage", + "library", + "primarySiteFoldChange", + "primarySitekIQR", + "primarySitePercentile", + "primarySiteQC", + "primarySiteZScore", + "internalPancancerFoldChange", + "internalPancancerkIQR", + "internalPancancerPercentile", + "internalPancancerQC", + "internalPancancerZScore", + "rnaReads", + "rpkm", + "tpm", ] SV_REQ = [ - 'eventType', - 'breakpoint', - 'gene1', # prev: nterm_hugo - 'gene2', # prev: cterm_hugo - 'exon1', # n-terminal - 'exon2', # c-terminal + "eventType", + "breakpoint", + "gene1", # prev: nterm_hugo + "gene2", # prev: cterm_hugo + "exon1", # n-terminal + "exon2", # c-terminal ] SV_KEY = SV_REQ[:] SV_OPTIONAL = [ - 'ctermTranscript', - 'ntermTranscript', - 'ctermGene', # combined hugo ensembl form - 'ntermGene', # combined hugo ensembl form - 'detectedIn', - 'conventionalName', - 'svg', - 'svgTitle', - 'name', - 'frame', - 'omicSupport', - 'highQuality', - 'comments', - 'library', - 'rnaAltCount', - 'rnaDepth', - 'tumourAltCount', - 'tumourDepth', - 'germline', - 'mavis_product_id', + "ctermTranscript", + "ntermTranscript", + "ctermGene", # combined hugo ensembl form + "ntermGene", # combined hugo ensembl form + "detectedIn", + "conventionalName", + "svg", + "svgTitle", + "name", + "frame", + "omicSupport", + "highQuality", + "comments", + "library", + "rnaAltCount", + "rnaDepth", + "tumourAltCount", + "tumourDepth", + "germline", + "mavis_product_id", ] @@ -170,7 +171,7 @@ def validate_variant_rows( Returns: the rows from the tab file as dictionaries """ - header = required + optional + ['key'] + header = required + optional + ["key"] result = [] keys = set() @@ -181,18 +182,18 @@ def validate_variant_rows( if not header_validated: for req_col in required: if req_col not in row: - raise ValueError(f'header missing required column ({req_col})') + raise ValueError(f"header missing required column ({req_col})") header_validated = True row_key = hash_key(row_to_key(row)) if row_key in keys: - raise 
ValueError(f'duplicate row key ({row_key}) from ({row_to_key(row)})') - row['key'] = row_key + raise ValueError(f"duplicate row key ({row_key}) from ({row_to_key(row)})") + row["key"] = row_key keys.add(row_key) for k, v in row.items(): if v is pd.NA: - row[k] = '' + row[k] = "" - result.append(cast(IprVariant, {col: row.get(col, '') for col in header})) + result.append(cast(IprVariant, {col: row.get(col, "") for col in header})) return result @@ -212,20 +213,20 @@ def preprocess_copy_variants(rows: Iterable[Dict]) -> List[IprCopyVariant]: display_name_mapping.update(dict([(v, v) for v in display_name_mapping.values()])) def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(['cnv'] + [row[key] for key in COPY_KEY]) + return tuple(["cnv"] + [row[key] for key in COPY_KEY]) result = validate_variant_rows(rows, COPY_REQ, COPY_OPTIONAL, row_key) ret_list = [cast(IprCopyVariant, var) for var in result] for row in ret_list: - kb_cat = row.get('kbCategory') - kb_cat = '' if pd.isnull(kb_cat) else str(kb_cat) + kb_cat = row.get("kbCategory") + kb_cat = "" if pd.isnull(kb_cat) else str(kb_cat) if kb_cat: if kb_cat not in INPUT_COPY_CATEGORIES.values(): - raise ValueError(f'invalid copy variant kbCategory value ({kb_cat})') - if not row.get('cnvState'): # apply default short display name - row['cnvState'] = display_name_mapping[kb_cat] - row['variant'] = kb_cat - row['variantType'] = 'cnv' + raise ValueError(f"invalid copy variant kbCategory value ({kb_cat})") + if not row.get("cnvState"): # apply default short display name + row["cnvState"] = display_name_mapping[kb_cat] + row["variant"] = kb_cat + row["variantType"] = "cnv" return ret_list @@ -238,28 +239,28 @@ def preprocess_small_mutations(rows: Iterable[Dict]) -> List[IprSmallMutationVar def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]: key_vals = [] - for kval in [row.get(key, '') for key in SMALL_MUT_KEY]: - key_vals.append(str(kval) if pd.notnull(kval) else '') - return tuple(['small mutation'] + 
key_vals) + for kval in [row.get(key, "") for key in SMALL_MUT_KEY]: + key_vals.append(str(kval) if pd.notnull(kval) else "") + return tuple(["small mutation"] + key_vals) result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key) if not result: return [] def pick_variant(row: IprSmallMutationVariant) -> str: - protein_change = row.get('proteinChange') + protein_change = row.get("proteinChange") if not pandas_falsy(protein_change): for longAA, shortAA in protein_letters_3to1.items(): protein_change = str(protein_change).replace(longAA, shortAA) - hgvsp = '{}:{}'.format(row['gene'], protein_change) + hgvsp = "{}:{}".format(row["gene"], protein_change) return hgvsp - for field in ['hgvsProtein', 'hgvsCds', 'hgvsGenomic']: + for field in ["hgvsProtein", "hgvsCds", "hgvsGenomic"]: if not pandas_falsy(row.get(field)): return str(row.get(field)) raise ValueError( - 'Variant field cannot be empty. Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string' + "Variant field cannot be empty. 
Must include proteinChange or one of the hgvs fields (hgvsProtein, hgvsCds, hgvsGenomic) to build the variant string" ) # 'location' and 'refAlt' are not currently used for matching; still optional and allowed blank @@ -268,21 +269,21 @@ def pick_variant(row: IprSmallMutationVariant) -> str: # for row in result: def convert_sm(row: IprVariant) -> IprSmallMutationVariant: ret = cast(IprSmallMutationVariant, row) - ret['variant'] = pick_variant(ret) - ret['variantType'] = 'mut' + ret["variant"] = pick_variant(ret) + ret["variantType"] = "mut" - if ret.get('startPosition') and not ret.get('endPosition'): - ret['endPosition'] = ret['startPosition'] + if ret.get("startPosition") and not ret.get("endPosition"): + ret["endPosition"] = ret["startPosition"] # default depth to alt + ref if not given - for sample_type in ('normal', 'rna', 'tumour'): + for sample_type in ("normal", "rna", "tumour"): if ( - ret.get(f'{sample_type}RefCount') - and ret.get(f'{sample_type}AltCount') - and not ret.get(f'{sample_type}Depth') + ret.get(f"{sample_type}RefCount") + and ret.get(f"{sample_type}AltCount") + and not ret.get(f"{sample_type}Depth") ): - ret[f'{sample_type}Depth'] = ( # type: ignore - ret[f'{sample_type}RefCount'] + ret[f'{sample_type}AltCount'] # type: ignore + ret[f"{sample_type}Depth"] = ( # type: ignore + ret[f"{sample_type}RefCount"] + ret[f"{sample_type}AltCount"] # type: ignore ) return ret @@ -298,65 +299,65 @@ def preprocess_expression_variants(rows: Iterable[Dict]) -> List[IprExprVariant] """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(['expression'] + [row[key] for key in EXP_KEY]) + return tuple(["expression"] + [row[key] for key in EXP_KEY]) variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key) result = [cast(IprExprVariant, var) for var in variants] float_columns = [ col for col in EXP_REQ + EXP_OPTIONAL - if col.endswith('kIQR') - or col.endswith('Percentile') - or col.endswith('FoldChange') - or col.endswith('QC') - or 
col.endswith('ZScore') - or col in ['tpm', 'rpkm'] + if col.endswith("kIQR") + or col.endswith("Percentile") + or col.endswith("FoldChange") + or col.endswith("QC") + or col.endswith("ZScore") + or col in ["tpm", "rpkm"] ] errors = [] for row in result: - row['variant'] = row['kbCategory'] - if not row['expressionState'] and row['kbCategory']: - row['expressionState'] = row['kbCategory'] + row["variant"] = row["kbCategory"] + if not row["expressionState"] and row["kbCategory"]: + row["expressionState"] = row["kbCategory"] - if row['variant'] and not pd.isnull(row['variant']): - if row['variant'] not in INPUT_EXPRESSION_CATEGORIES.values(): + if row["variant"] and not pd.isnull(row["variant"]): + if row["variant"] not in INPUT_EXPRESSION_CATEGORIES.values(): err_msg = f"{row['gene']} variant '{row['variant']}' not in {INPUT_EXPRESSION_CATEGORIES.values()}" errors.append(err_msg) logger.error(err_msg) - row['variantType'] = 'exp' + row["variantType"] = "exp" for col in float_columns: - if row[col] in ['inf', '+inf', '-inf']: - row[col] = row[col].replace('inf', 'Infinity') + if row[col] in ["inf", "+inf", "-inf"]: + row[col] = row[col].replace("inf", "Infinity") # check images exist - if row['histogramImage'] and not os.path.exists(row['histogramImage']): + if row["histogramImage"] and not os.path.exists(row["histogramImage"]): raise FileNotFoundError(f'missing image ({row["histogramImage"]})') if errors: - raise ValueError(f'{len(errors)} Invalid expression variants in file') + raise ValueError(f"{len(errors)} Invalid expression variants in file") return result def create_graphkb_sv_notation(row: IprFusionVariant) -> str: """Generate GKB/IPR fusion style notation from a structural variant.""" - gene1 = row['gene1'] or '?' - gene2 = row['gene2'] or '?' - exon1 = str(row['exon1']) if row['exon1'] else '?' - exon2 = str(row['exon2']) if row['exon2'] else '?' - if not row['gene1']: + gene1 = row["gene1"] or "?" + gene2 = row["gene2"] or "?" 
+ exon1 = str(row["exon1"]) if row["exon1"] else "?" + exon2 = str(row["exon2"]) if row["exon2"] else "?" + if not row["gene1"]: gene1, gene2 = gene2, gene1 exon1, exon2 = exon2, exon1 - if gene1 == '?': + if gene1 == "?": raise ValueError( f'both genes cannot be blank for a structural variant {row["key"]}. At least 1 gene must be entered' ) # force exons to integer repr string exon1 = exon1[:-2] if exon1.endswith(".0") else exon1 exon2 = exon2[:-2] if exon2.endswith(".0") else exon2 - return f'({gene1},{gene2}):fusion(e.{exon1},e.{exon2})' + return f"({gene1},{gene2}):fusion(e.{exon1},e.{exon2})" def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVariant]: @@ -366,21 +367,21 @@ def preprocess_structural_variants(rows: Iterable[Dict]) -> List[IprFusionVarian """ def row_key(row: Dict) -> Tuple[str, ...]: - return tuple(['sv'] + [row[key] for key in SV_KEY]) + return tuple(["sv"] + [row[key] for key in SV_KEY]) variants = validate_variant_rows(rows, SV_REQ, SV_OPTIONAL, row_key) result = [cast(IprFusionVariant, var) for var in variants] # genes are optional for structural variants for row in result: - row['variant'] = create_graphkb_sv_notation(row) - row['variantType'] = 'sv' + row["variant"] = create_graphkb_sv_notation(row) + row["variantType"] = "sv" # check and load the svg file where applicable - if row['svg'] and not pd.isnull(row['svg']): - if not os.path.exists(row['svg']): - raise FileNotFoundError(row['svg']) - with open(row['svg'], 'r') as fh: - row['svg'] = fh.read() + if row["svg"] and not pd.isnull(row["svg"]): + if not os.path.exists(row["svg"]): + raise FileNotFoundError(row["svg"]) + with open(row["svg"], "r") as fh: + row["svg"] = fh.read() return result @@ -408,39 +409,41 @@ def check_variant_links( missing_information_genes = set() missing_information_errors = set() - copy_variant_genes = {variant['gene'] for variant in copy_variants} - expression_variant_genes = {variant['gene'] for variant in expression_variants} + 
copy_variant_genes = {variant["gene"] for variant in copy_variants} + expression_variant_genes = {variant["gene"] for variant in expression_variants} genes_with_variants = set() # filter excess copy variants variant = IprVariant # to silence type errors for variant in copy_variants: - gene = variant['gene'] + gene = variant["gene"] if not gene: logger.error("copy_variant data cannot be applied to an empty genename") - elif variant['variant']: + elif variant["variant"]: genes_with_variants.add(gene) if expression_variant_genes and gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a copy variant but is missing expression information' + f"gene ({gene}) has a copy variant but is missing expression information" ) for variant in expression_variants: - gene = variant['gene'] + gene = variant["gene"] if not gene: - logger.error("expression_variant data cannot be applied to an empty genename") - elif variant['variant']: + logger.error( + "expression_variant data cannot be applied to an empty genename" + ) + elif variant["variant"]: genes_with_variants.add(gene) if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has an expression variant but is missing copy number information' + f"gene ({gene}) has an expression variant but is missing copy number information" ) for variant in small_mutations: - gene = variant['gene'] + gene = variant["gene"] if not gene: logger.error("small_mutation data cannot be applied to an empty genename") continue @@ -448,104 +451,104 @@ def check_variant_links( if copy_variant_genes and gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a small mutation but is missing copy number information' + f"gene ({gene}) has a small mutation but is missing copy number information" ) if expression_variant_genes and gene not 
in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a small mutation but is missing expression information' + f"gene ({gene}) has a small mutation but is missing expression information" ) genes_with_variants.add(gene) for variant in structural_variants: - for gene in [variant['gene1'], variant['gene2']]: + for gene in [variant["gene1"], variant["gene2"]]: if gene: # genes are optional for structural variants if gene not in copy_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a structural variant but is missing copy number information' + f"gene ({gene}) has a structural variant but is missing copy number information" ) if gene not in expression_variant_genes: missing_information_genes.add(gene) missing_information_errors.add( - f'gene ({gene}) has a structural variant but is missing expression information' + f"gene ({gene}) has a structural variant but is missing expression information" ) genes_with_variants.add(gene) if missing_information_genes: for err_msg in sorted(missing_information_errors): logger.debug(err_msg) - link_err_msg = ( - f'Missing information variant links on {len(missing_information_genes)} genes' - ) + link_err_msg = f"Missing information variant links on {len(missing_information_genes)} genes" logger.warning(link_err_msg) return genes_with_variants -def check_comparators(content: Dict, expresssionVariants: List[IprExprVariant] = []) -> None: +def check_comparators( + content: Dict, expresssionVariants: List[IprExprVariant] = [] +) -> None: """ Given the optional content dictionary, check that based on the analyses present the correct/sufficient comparators have also been specified """ - mutation_burden = 'mutationBurden' - comparator_roles = {c['analysisRole'] for c in content.get('comparators', [])} + mutation_burden = "mutationBurden" + comparator_roles = {c["analysisRole"] for c in content.get("comparators", [])} - for 
image in content.get('images', []): - key = image['key'] + for image in content.get("images", []): + key = image["key"] if key.startswith(mutation_burden): - comp_type = key.split('.')[-1] - role = f'mutation burden ({comp_type})' + comp_type = key.split(".")[-1] + role = f"mutation burden ({comp_type})" if role in comparator_roles: continue - if '_sv.' in key: - sv_role = f'mutation burden SV ({comp_type})' + if "_sv." in key: + sv_role = f"mutation burden SV ({comp_type})" if sv_role in comparator_roles: continue - raise ValueError(f'missing required comparator definition ({role})') + raise ValueError(f"missing required comparator definition ({role})") if expresssionVariants: - required_comparators = {'expression (disease)'} + required_comparators = {"expression (disease)"} def all_none(row: IprExprVariant, columns: List[str]) -> bool: - return all([row.get(col) is None or row.get(col) == '' for col in columns]) + return all([row.get(col) is None or row.get(col) == "" for col in columns]) for exp in expresssionVariants: if not all_none( exp, [ - 'primarySitekIQR', - 'primarySitePercentile', - 'primarySiteZScore', - 'primarySiteFoldChange', + "primarySitekIQR", + "primarySitePercentile", + "primarySiteZScore", + "primarySiteFoldChange", ], ): - required_comparators.add('expression (primary site)') + required_comparators.add("expression (primary site)") if not all_none( exp, [ - 'biopsySitekIQR', - 'biopsySitePercentile', - 'biopsySiteZScore', - 'biopsySiteFoldChange', + "biopsySitekIQR", + "biopsySitePercentile", + "biopsySiteZScore", + "biopsySiteFoldChange", ], ): - required_comparators.add('expression (biopsy site)') + required_comparators.add("expression (biopsy site)") if not all_none( exp, [ - 'internalPancancerkIQR', - 'internalPancancerPercentile', - 'internalPancancerZScore', - 'internalPancancerFoldChange', + "internalPancancerkIQR", + "internalPancancerPercentile", + "internalPancancerZScore", + "internalPancancerFoldChange", ], ): - 
required_comparators.add('expression (internal pancancer cohort)') + required_comparators.add("expression (internal pancancer cohort)") if required_comparators - comparator_roles: - missing = '; '.join(sorted(list(required_comparators - comparator_roles))) - raise ValueError(f'missing required comparator definitions ({missing})') + missing = "; ".join(sorted(list(required_comparators - comparator_roles))) + raise ValueError(f"missing required comparator definitions ({missing})") def extend_with_default(validator_class): @@ -570,7 +573,9 @@ def check_null(checker, instance): type_checker = validator_class.TYPE_CHECKER.redefine("null", check_null) return jsonschema.validators.extend( - validator_class, validators={"properties": set_defaults}, type_checker=type_checker + validator_class, + validators={"properties": set_defaults}, + type_checker=type_checker, ) @@ -584,7 +589,7 @@ def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> Adds defaults as reccommended by: https://python-jsonschema.readthedocs.io/en/latest/faq/#why-doesn-t-my-schema-s-default-property-set-the-default-on-my-instance """ - with open(schema_file, 'r') as fh: + with open(schema_file, "r") as fh: schema = json.load(fh) return DefaultValidatingDraft7Validator(schema).validate(content) diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py index 6de20d2..7361042 100644 --- a/pori_python/ipr/ipr.py +++ b/pori_python/ipr/ipr.py @@ -3,24 +3,32 @@ by other reporting systems """ +from typing import Dict, Iterable, List, Sequence, Set, Tuple + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb import statement as gkb_statement from pori_python.graphkb import vocab as gkb_vocab -from typing import Dict, Iterable, List, Sequence, Set, Tuple from .constants import GERMLINE_BASE_TERMS, VARIANT_CLASSES -from .types import GkbStatement, ImageDefinition, IprFusionVariant, IprGene, IprVariant, KbMatch +from .types import ( + GkbStatement, + ImageDefinition, 
+ IprFusionVariant, + IprGene, + IprVariant, + KbMatch, +) from .util import find_variant, logger def display_evidence_levels(statement: GkbStatement) -> str: result = [] - for evidence_level in statement.get('evidenceLevel', []) or []: + for evidence_level in statement.get("evidenceLevel", []) or []: if isinstance(evidence_level, str): result.append(evidence_level) - elif 'displayName' in evidence_level: - result.append(evidence_level['displayName']) - return ';'.join(sorted(result)) + elif "displayName" in evidence_level: + result.append(evidence_level["displayName"]) + return ";".join(sorted(result)) def filter_structural_variants( @@ -32,9 +40,13 @@ def filter_structural_variants( Filter structural variants to remove non-high quality events unless they are matched/annotated or they involve a gene that is a known fusion partner """ - matched_svs = {match['variant'] for match in kb_matches if match['variantType'] == 'sv'} + matched_svs = { + match["variant"] for match in kb_matches if match["variantType"] == "sv" + } fusion_genes = { - gene['name'] for gene in gene_annotations if gene.get('knownFusionPartner', False) + gene["name"] + for gene in gene_annotations + if gene.get("knownFusionPartner", False) } result = [] @@ -42,10 +54,10 @@ def filter_structural_variants( for structural_variant in structural_variants: if any( [ - structural_variant['highQuality'], - structural_variant['key'] in matched_svs, - structural_variant['gene1'] in fusion_genes, - structural_variant['gene2'] in fusion_genes, + structural_variant["highQuality"], + structural_variant["key"] in matched_svs, + structural_variant["gene1"] in fusion_genes, + structural_variant["gene2"] in fusion_genes, ] ): result.append(structural_variant) @@ -72,11 +84,15 @@ def get_evidencelevel_mapping(graphkb_conn: GraphKBConnection) -> Dict[str, str] # Filter IPR EvidenceLevel and map each outgoing CrossReferenceOf to displayName ipr_source_rid = graphkb_conn.get_source("ipr")["@rid"] - ipr_evidence_levels = 
filter(lambda d: d.get("source") == ipr_source_rid, evidence_levels) + ipr_evidence_levels = filter( + lambda d: d.get("source") == ipr_source_rid, evidence_levels + ) cross_references_mapping: Dict[str, str] = dict() ipr_rids_to_displayname = dict() for level in ipr_evidence_levels: - d = map(lambda i: (i, level["displayName"]), level.get("out_CrossReferenceOf", [])) + d = map( + lambda i: (i, level["displayName"]), level.get("out_CrossReferenceOf", []) + ) cross_references_mapping.update(d) ipr_rids_to_displayname[level["@rid"]] = level["displayName"] @@ -119,21 +135,25 @@ def convert_statements_to_alterations( - only report disease matched prognostic markers https://www.bcgsc.ca/jira/browse/GERO-72 and GERO-196 """ disease_matches = { - r['@rid'] - for r in gkb_vocab.get_term_tree(graphkb_conn, disease_name, ontology_class='Disease') + r["@rid"] + for r in gkb_vocab.get_term_tree( + graphkb_conn, disease_name, ontology_class="Disease" + ) } if not disease_matches: - raise ValueError(f'failed to match disease ({disease_name}) to graphkb') + raise ValueError(f"failed to match disease ({disease_name}) to graphkb") rows = [] ev_map = get_evidencelevel_mapping(graphkb_conn) # GERO-318 - add all IPR-A evidence equivalents to the approvedTherapy flag - approved = set([ev for (ev, ipr) in ev_map.items() if ipr == 'IPR-A']) + approved = set([ev for (ev, ipr) in ev_map.items() if ipr == "IPR-A"]) # get the recruitment status for any trial associated with a statement clinical_trials = [ - s['subject']['@rid'] for s in statements if s['subject']['@class'] == 'ClinicalTrial' + s["subject"]["@rid"] + for s in statements + if s["subject"]["@class"] == "ClinicalTrial" ] recruitment_statuses = {} if clinical_trials: @@ -141,71 +161,81 @@ def convert_statements_to_alterations( for rid in clinical_trials: query_result = graphkb_conn.query( { - 'target': {'target': 'ClinicalTrial', 'filters': {'@rid': rid}}, - 'returnProperties': ['@rid', 'recruitmentStatus'], + "target": 
{"target": "ClinicalTrial", "filters": {"@rid": rid}}, + "returnProperties": ["@rid", "recruitmentStatus"], } ) if query_result: - recruitment_statuses[rid] = query_result[0]['recruitmentStatus'] + recruitment_statuses[rid] = query_result[0]["recruitmentStatus"] for statement in statements: - variants = [c for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES] - diseases = [c for c in statement['conditions'] if c['@class'] == 'Disease'] - disease_match = len(diseases) == 1 and diseases[0]['@rid'] in disease_matches - pmid = ';'.join([e['displayName'] for e in statement['evidence']]) + variants = [ + c for c in statement["conditions"] if c["@class"] in VARIANT_CLASSES + ] + diseases = [c for c in statement["conditions"] if c["@class"] == "Disease"] + disease_match = len(diseases) == 1 and diseases[0]["@rid"] in disease_matches + pmid = ";".join([e["displayName"] for e in statement["evidence"]]) ipr_section = gkb_statement.categorize_relevance( - graphkb_conn, statement['relevance']['@rid'] + graphkb_conn, statement["relevance"]["@rid"] ) approved_therapy = False - if ipr_section == 'therapeutic': - for level in statement['evidenceLevel'] or []: - if level['@rid'] in approved: + if ipr_section == "therapeutic": + for level in statement["evidenceLevel"] or []: + if level["@rid"] in approved: approved_therapy = True break - if ipr_section == 'prognostic' and not disease_match: + if ipr_section == "prognostic" and not disease_match: continue # GERO-72 / GERO-196 evidence_level_str = display_evidence_levels(statement) - evidence_levels = statement.get('evidenceLevel') or [] - ipr_evidence_levels = [ev_map[el.get('@rid', '')] for el in evidence_levels if el] - ipr_evidence_levels_str = ';'.join(sorted(set([el for el in ipr_evidence_levels]))) + evidence_levels = statement.get("evidenceLevel") or [] + ipr_evidence_levels = [ + ev_map[el.get("@rid", "")] for el in evidence_levels if el + ] + ipr_evidence_levels_str = ";".join( + sorted(set([el for el in 
ipr_evidence_levels])) + ) for variant in variants: - if variant['@rid'] not in variant_matches: + if variant["@rid"] not in variant_matches: continue row = KbMatch( { - 'approvedTherapy': approved_therapy, - 'category': ipr_section or 'unknown', - 'context': ( - statement['subject']['displayName'] if statement['subject'] else None + "approvedTherapy": approved_therapy, + "category": ipr_section or "unknown", + "context": ( + statement["subject"]["displayName"] + if statement["subject"] + else None ), - 'kbContextId': (statement['subject']['@rid'] if statement['subject'] else None), - 'disease': ';'.join(sorted(d['displayName'] for d in diseases)), - 'evidenceLevel': evidence_level_str, - 'iprEvidenceLevel': ipr_evidence_levels_str, - 'kbStatementId': statement['@rid'], - 'kbVariant': variant['displayName'], - 'kbVariantId': variant['@rid'], - 'matchedCancer': disease_match, - 'reference': pmid, - 'relevance': statement['relevance']['displayName'], - 'kbRelevanceId': statement['relevance']['@rid'], - 'externalSource': ( - str(statement['source'].get('displayName', '')) - if statement['source'] + "kbContextId": ( + statement["subject"]["@rid"] if statement["subject"] else None + ), + "disease": ";".join(sorted(d["displayName"] for d in diseases)), + "evidenceLevel": evidence_level_str, + "iprEvidenceLevel": ipr_evidence_levels_str, + "kbStatementId": statement["@rid"], + "kbVariant": variant["displayName"], + "kbVariantId": variant["@rid"], + "matchedCancer": disease_match, + "reference": pmid, + "relevance": statement["relevance"]["displayName"], + "kbRelevanceId": statement["relevance"]["@rid"], + "externalSource": ( + str(statement["source"].get("displayName", "")) + if statement["source"] else None ), - 'externalStatementId': statement.get('sourceId'), - 'reviewStatus': statement.get('reviewStatus'), - 'kbData': {}, + "externalStatementId": statement.get("sourceId"), + "reviewStatus": statement.get("reviewStatus"), + "kbData": {}, } ) - if 
statement['relevance']['name'] == 'eligibility': - row['kbData']['recruitment_status'] = recruitment_statuses.get( - row['kbContextId'], 'not found' + if statement["relevance"]["name"] == "eligibility": + row["kbData"]["recruitment_status"] = recruitment_statuses.get( + row["kbContextId"], "not found" ) rows.append(row) return rows @@ -228,22 +258,24 @@ def select_expression_plots( """ selected_variants = { - (match['variantType'], match['variant']) + (match["variantType"], match["variant"]) for match in kb_matches - if match['category'] == 'therapeutic' + if match["category"] == "therapeutic" } images_by_gene: Dict[str, ImageDefinition] = {} selected_genes = set() for variant in all_variants: - if (variant['variantType'], variant['key']) in selected_variants: - for key in ['gene', 'gene1', 'gene2']: + if (variant["variantType"], variant["key"]) in selected_variants: + for key in ["gene", "gene1", "gene2"]: gene = variant.get(key) if gene: selected_genes.add(str(gene)) - gene = str(variant.get('gene', '')) - hist = str(variant.get('histogramImage', '')) + gene = str(variant.get("gene", "")) + hist = str(variant.get("histogramImage", "")) if hist: - images_by_gene[gene] = ImageDefinition({'key': f'expDensity.{gene}', 'path': hist}) + images_by_gene[gene] = ImageDefinition( + {"key": f"expDensity.{gene}", "path": hist} + ) return [images_by_gene[gene] for gene in selected_genes if gene in images_by_gene] @@ -256,17 +288,17 @@ def create_key_alterations( """ alterations = [] type_mapping = { - 'mut': 'smallMutations', - 'cnv': 'CNVs', - 'sv': 'SVs', - 'exp': 'expressionOutliers', + "mut": "smallMutations", + "cnv": "CNVs", + "sv": "SVs", + "exp": "expressionOutliers", } counts: Dict[str, Set] = {v: set() for v in type_mapping.values()} skipped_variant_types = [] for kb_match in kb_matches: - variant_type = kb_match['variantType'] - variant_key = kb_match['variant'] - if kb_match['category'] == 'unknown': + variant_type = kb_match["variantType"] + variant_key = 
kb_match["variant"] + if kb_match["category"] == "unknown": continue if variant_type not in type_mapping.keys(): @@ -285,32 +317,36 @@ def create_key_alterations( counts[type_mapping[variant_type]].add(variant_key) - if variant_type == 'exp': - alterations.append(f'{variant.get("gene","")} ({variant.get("expressionState")})') - elif variant_type == 'cnv': + if variant_type == "exp": + alterations.append( + f'{variant.get("gene","")} ({variant.get("expressionState")})' + ) + elif variant_type == "cnv": alterations.append(f'{variant.get("gene","")} ({variant.get("cnvState")})') # only show germline if relevant - elif kb_match['category'] in GERMLINE_BASE_TERMS and variant.get('germline'): + elif kb_match["category"] in GERMLINE_BASE_TERMS and variant.get("germline"): alterations.append(f"germline {variant['variant']}") else: - alterations.append(variant['variant']) + alterations.append(variant["variant"]) counted_variants = set.union(*counts.values()) - counts['variantsUnknown'] = set() + counts["variantsUnknown"] = set() # count the un-matched variants for variant in all_variants: - if variant['variant'] and variant['key'] not in counted_variants: - counts['variantsUnknown'].add(variant['key']) + if variant["variant"] and variant["key"] not in counted_variants: + counts["variantsUnknown"].add(variant["key"]) return ( - [{'geneVariant': alt} for alt in set(alterations)], + [{"geneVariant": alt} for alt in set(alterations)], {k: len(v) for k, v in counts.items()}, ) def germline_kb_matches( - kb_matches: List[KbMatch], all_variants: Sequence[IprVariant], assume_somatic: bool = True + kb_matches: List[KbMatch], + all_variants: Sequence[IprVariant], + assume_somatic: bool = True, ) -> List[KbMatch]: """Filter kb_matches for matching to germline or somatic events using the 'germline' optional property. 
@@ -327,14 +363,14 @@ def germline_kb_matches( filtered list of kb_matches """ ret_list = [] - germ_alts = [alt for alt in kb_matches if alt['category'] in GERMLINE_BASE_TERMS] + germ_alts = [alt for alt in kb_matches if alt["category"] in GERMLINE_BASE_TERMS] somatic_alts = [alt for alt in kb_matches if alt not in germ_alts] if germ_alts: logger.info(f"checking germline status of {GERMLINE_BASE_TERMS}") for alt in germ_alts: - var_list = [v for v in all_variants if v['key'] == alt['variant']] - germline_var_list = [v for v in var_list if v.get('germline')] - unknown_var_list = [v for v in var_list if 'germline' not in v] + var_list = [v for v in all_variants if v["key"] == alt["variant"]] + germline_var_list = [v for v in var_list if v.get("germline")] + unknown_var_list = [v for v in var_list if "germline" not in v] if germline_var_list: logger.debug( f"germline kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" @@ -360,8 +396,10 @@ def germline_kb_matches( if somatic_alts: # Remove any matches to germline events for alt in somatic_alts: - var_list = [v for v in all_variants if v['key'] == alt['variant']] - somatic_var_list = [v for v in var_list if not v.get('germline', not assume_somatic)] + var_list = [v for v in all_variants if v["key"] == alt["variant"]] + somatic_var_list = [ + v for v in var_list if not v.get("germline", not assume_somatic) + ] if var_list and not somatic_var_list: logger.debug( f"Dropping germline match to somatic statement kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}" @@ -369,6 +407,8 @@ def germline_kb_matches( elif somatic_var_list: ret_list.append(alt) # match to somatic variant else: - ret_list.append(alt) # alteration not in any specific keys matches to check. + ret_list.append( + alt + ) # alteration not in any specific keys matches to check. 
return ret_list diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py index 2c6eefd..8c08f24 100644 --- a/pori_python/ipr/main.py +++ b/pori_python/ipr/main.py @@ -5,9 +5,10 @@ import logging import os from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from typing import Dict, List, Sequence + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb.genes import get_gene_information -from typing import Dict, List, Sequence from .annotate import ( annotate_copy_variants, @@ -41,59 +42,68 @@ CACHE_GENE_MINIMUM = 5000 RENAMED_GENE_PROPERTIES = { # old_name: new_name - 'cancerRelated': 'kbStatementRelated', - 'cancerGene': 'cancerGeneListMatch', + "cancerRelated": "kbStatementRelated", + "cancerGene": "cancerGeneListMatch", } def file_path(path: str) -> str: if not os.path.exists(path): - raise argparse.ArgumentTypeError(f'{repr(path)} is not a valid filename. does not exist') + raise argparse.ArgumentTypeError( + f"{repr(path)} is not a valid filename. 
does not exist" + ) return path def timestamp() -> str: - return datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + return datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") def command_interface() -> None: parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) - req = parser.add_argument_group('required arguments') - (req if not os.environ.get('USER') else parser).add_argument( - '--username', - required=not os.environ.get('USER'), - default=os.environ.get('USER'), - help='username to use connecting to graphkb/ipr', + req = parser.add_argument_group("required arguments") + (req if not os.environ.get("USER") else parser).add_argument( + "--username", + required=not os.environ.get("USER"), + default=os.environ.get("USER"), + help="username to use connecting to graphkb/ipr", + ) + req.add_argument( + "--password", required=True, help="password to use connecting to graphkb/ipr" ) - req.add_argument('--password', required=True, help='password to use connecting to graphkb/ipr') req.add_argument( - '-c', '--content', required=True, type=file_path, help="Report Content as JSON" + "-c", "--content", required=True, type=file_path, help="Report Content as JSON" ) - parser.add_argument('--ipr_url', default=DEFAULT_URL) - parser.add_argument('--graphkb_url', default=None) - parser.add_argument('--log_level', default='info', choices=LOG_LEVELS.keys()) + parser.add_argument("--ipr_url", default=DEFAULT_URL) + parser.add_argument("--graphkb_url", default=None) + parser.add_argument("--log_level", default="info", choices=LOG_LEVELS.keys()) parser.add_argument( - '--therapeutics', default=False, help='Generate therapeutic options', action='store_true' + "--therapeutics", + default=False, + help="Generate therapeutic options", + action="store_true", ) parser.add_argument( - '--skip_comments', + "--skip_comments", default=False, - action='store_true', - help='Turn off generating the analyst comments section of the report', + action="store_true", + help="Turn 
off generating the analyst comments section of the report", ) parser.add_argument( - '-o', '--output_json_path', help='path to a JSON to output the report upload body' + "-o", + "--output_json_path", + help="path to a JSON to output the report upload body", ) parser.add_argument( - '-w', - '--always_write_output_json', + "-w", + "--always_write_output_json", action="store_true", - help='Write to output_json_path on successful IPR uploads instead of just when the upload fails', + help="Write to output_json_path on successful IPR uploads instead of just when the upload fails", ) args = parser.parse_args() - with open(args.content, 'r') as fh: + with open(args.content, "r") as fh: content = json.load(fh) create_report( @@ -118,12 +128,14 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict """ if ( ipr_spec - and 'components' in ipr_spec.keys() - and 'schemas' in ipr_spec['components'].keys() - and 'genesCreate' in ipr_spec['components']['schemas'].keys() - and 'properties' in ipr_spec['components']['schemas']['genesCreate'].keys() + and "components" in ipr_spec.keys() + and "schemas" in ipr_spec["components"].keys() + and "genesCreate" in ipr_spec["components"]["schemas"].keys() + and "properties" in ipr_spec["components"]["schemas"]["genesCreate"].keys() ): - genes_spec = ipr_spec['components']['schemas']['genesCreate']['properties'].keys() + genes_spec = ipr_spec["components"]["schemas"]["genesCreate"][ + "properties" + ].keys() # check what ipr report upload expects and adjust contents to match for old_name, new_name in RENAMED_GENE_PROPERTIES.items(): @@ -131,13 +143,13 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict logger.warning( f"Legacy IPR - Renaming property {new_name} to {old_name} for compatibility to ipr_spec" ) - for gene in upload_content['genes']: + for gene in upload_content["genes"]: if new_name in gene: gene[old_name] = gene[new_name] gene.pop(new_name) else: outdate_properties = 0 
- for gene in upload_content['genes']: + for gene in upload_content["genes"]: if old_name in gene: gene[new_name] = gene[old_name] gene.pop(old_name) @@ -149,7 +161,7 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict # remove any unhandled incompatible keys removed_keys: Dict[str, int] = {} - for gene in upload_content['genes']: + for gene in upload_content["genes"]: unsupported_keys = [key for key in gene.keys() if key not in genes_spec] for key in unsupported_keys: if key in removed_keys: @@ -158,25 +170,29 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict removed_keys[key] = 1 gene.pop(key) for key, count in removed_keys.items(): - logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.") + logger.warning( + f"IPR unsupported property '{key}' removed from {count} genes." + ) - drop_columns = ['variant', 'variantType', 'histogramImage'] + drop_columns = ["variant", "variantType", "histogramImage"] # DEVSU-2034 - use a 'displayName' VARIANT_LIST_KEYS = [ - 'expressionVariants', - 'smallMutations', - 'copyVariants', - 'structuralVariants', - 'probeResults', - 'msi', + "expressionVariants", + "smallMutations", + "copyVariants", + "structuralVariants", + "probeResults", + "msi", ] for variant_list_section in VARIANT_LIST_KEYS: for variant in upload_content.get(variant_list_section, []): - if not variant.get('displayName'): - variant['displayName'] = ( - variant.get('variant') or variant.get('kbCategory') or variant.get('key', '') + if not variant.get("displayName"): + variant["displayName"] = ( + variant.get("variant") + or variant.get("kbCategory") + or variant.get("key", "") ) - if variant_list_section == 'probeResults': + if variant_list_section == "probeResults": # currently probeResults will error if they do NOT have a 'variant' column. # smallMutations will error if they DO have a 'variant' column. 
continue @@ -184,20 +200,22 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict if col in variant: del variant[col] # tmburMutationBurden is a single value, not list - if upload_content.get('tmburMutationBurden'): - if not upload_content['tmburMutationBurden'].get('displayName'): - upload_content['tmburMutationBurden']['displayName'] = upload_content[ - 'tmburMutationBurden' - ].get('kbCategory', '') - - for row in upload_content['kbMatches']: - del row['kbContextId'] - del row['kbRelevanceId'] + if upload_content.get("tmburMutationBurden"): + if not upload_content["tmburMutationBurden"].get("displayName"): + upload_content["tmburMutationBurden"]["displayName"] = upload_content[ + "tmburMutationBurden" + ].get("kbCategory", "") + + for row in upload_content["kbMatches"]: + del row["kbContextId"] + del row["kbRelevanceId"] return upload_content def create_report(**kwargs) -> Dict: - logger.warning("Deprecated function 'create_report' called - use ipr_report instead") + logger.warning( + "Deprecated function 'create_report' called - use ipr_report instead" + ) return ipr_report(**kwargs) @@ -206,12 +224,12 @@ def ipr_report( password: str, content: Dict, ipr_url: str = DEFAULT_URL, - log_level: str = 'info', - output_json_path: str = '', + log_level: str = "info", + output_json_path: str = "", always_write_output_json: bool = False, ipr_upload: bool = True, interactive: bool = False, - graphkb_url: str = '', + graphkb_url: str = "", generate_therapeutics: bool = False, generate_comments: bool = True, match_germline: bool = False, @@ -245,23 +263,29 @@ def ipr_report( # set the default logging configuration logging.basicConfig( level=LOG_LEVELS[log_level], - format='%(asctime)s %(name)s %(levelname)s %(message)s', - datefmt='%m-%d-%y %H:%M:%S', + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt="%m-%d-%y %H:%M:%S", ) # validate the JSON content follows the specification try: validate_report_content(content) except 
jsonschema.exceptions.ValidationError as err: - logger.error("Failed schema check - report variants may be corrupted or unmatched.") + logger.error( + "Failed schema check - report variants may be corrupted or unmatched." + ) logger.error(f"Failed schema check: {err}") - kb_disease_match = content['kbDiseaseMatch'] + kb_disease_match = content["kbDiseaseMatch"] # validate the input variants - small_mutations = preprocess_small_mutations(content.get('smallMutations', [])) - structural_variants = preprocess_structural_variants(content.get('structuralVariants', [])) - copy_variants = preprocess_copy_variants(content.get('copyVariants', [])) - expression_variants = preprocess_expression_variants(content.get('expressionVariants', [])) + small_mutations = preprocess_small_mutations(content.get("smallMutations", [])) + structural_variants = preprocess_structural_variants( + content.get("structuralVariants", []) + ) + copy_variants = preprocess_copy_variants(content.get("copyVariants", [])) + expression_variants = preprocess_expression_variants( + content.get("expressionVariants", []) + ) if expression_variants: check_comparators(content, expression_variants) @@ -274,7 +298,7 @@ def ipr_report( ipr_spec = ipr_conn.get_spec() if graphkb_url: - logger.info(f'connecting to graphkb: {graphkb_url}') + logger.info(f"connecting to graphkb: {graphkb_url}") graphkb_conn = GraphKBConnection(graphkb_url) else: graphkb_conn = GraphKBConnection() @@ -285,61 +309,65 @@ def ipr_report( # Signature category variants tmb_variant: IprVariant = {} tmb_matches = [] - if 'tmburMutationBurden' in content.keys(): + if "tmburMutationBurden" in content.keys(): tmb_val = 0.0 tmb = {} try: - tmb = content.get('tmburMutationBurden', {}) - tmb_val = tmb['genomeIndelTmb'] + tmb['genomeSnvTmb'] + tmb = content.get("tmburMutationBurden", {}) + tmb_val = tmb["genomeIndelTmb"] + tmb["genomeSnvTmb"] except Exception as err: logger.error(f"tmburMutationBurden parsing failure: {err}") if tmb_val >= TMB_HIGH: 
logger.warning( - f'GERO-296 - tmburMutationBurden high -checking graphkb matches for {TMB_HIGH_CATEGORY}' + f"GERO-296 - tmburMutationBurden high -checking graphkb matches for {TMB_HIGH_CATEGORY}" ) - if not tmb.get('key'): - tmb['key'] = TMB_HIGH_CATEGORY - if not tmb.get('kbCategory'): - tmb['kbCategory'] = TMB_HIGH_CATEGORY + if not tmb.get("key"): + tmb["key"] = TMB_HIGH_CATEGORY + if not tmb.get("kbCategory"): + tmb["kbCategory"] = TMB_HIGH_CATEGORY # GERO-296 - try matching to graphkb - tmb_matches = annotate_tmb(graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY) + tmb_matches = annotate_tmb( + graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY + ) if tmb_matches: - tmb_variant['kbCategory'] = TMB_HIGH_CATEGORY # type: ignore - tmb_variant['variant'] = TMB_HIGH_CATEGORY - tmb_variant['key'] = tmb['key'] - tmb_variant['variantType'] = 'tmb' + tmb_variant["kbCategory"] = TMB_HIGH_CATEGORY # type: ignore + tmb_variant["variant"] = TMB_HIGH_CATEGORY + tmb_variant["key"] = tmb["key"] + tmb_variant["variantType"] = "tmb" logger.info( f"GERO-296 '{TMB_HIGH_CATEGORY}' matches {len(tmb_matches)} statements." 
) gkb_matches.extend(tmb_matches) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - msi = content.get('msi', []) + msi = content.get("msi", []) msi_matches = [] msi_variant: IprVariant = {} if msi: # only one msi variant per library if isinstance(msi, list): - msi_cat = msi[0].get('kbCategory') + msi_cat = msi[0].get("kbCategory") elif isinstance(msi, str): msi_cat = msi else: - msi_cat = msi.get('kbCategory') + msi_cat = msi.get("kbCategory") msi_variant = msi.copy() - logger.info(f'Matching GKB msi {msi_cat}') + logger.info(f"Matching GKB msi {msi_cat}") msi_matches = annotate_msi(graphkb_conn, kb_disease_match, msi_cat) if msi_matches: - msi_variant['kbCategory'] = msi_cat # type: ignore - msi_variant['variant'] = msi_cat - msi_variant['key'] = msi_cat - msi_variant['variantType'] = 'msi' - logger.info(f"GERO-295 '{msi_cat}' matches {len(msi_matches)} msi statements.") + msi_variant["kbCategory"] = msi_cat # type: ignore + msi_variant["variant"] = msi_cat + msi_variant["key"] = msi_cat + msi_variant["variantType"] = "msi" + logger.info( + f"GERO-295 '{msi_cat}' matches {len(msi_matches)} msi statements." 
+ ) gkb_matches.extend(msi_matches) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(small_mutations)} small mutations') + logger.info(f"annotating {len(small_mutations)} small mutations") gkb_matches.extend( annotate_positional_variants( graphkb_conn, small_mutations, kb_disease_match, show_progress=interactive @@ -347,15 +375,18 @@ def ipr_report( ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(structural_variants)} structural variants') + logger.info(f"annotating {len(structural_variants)} structural variants") gkb_matches.extend( annotate_positional_variants( - graphkb_conn, structural_variants, kb_disease_match, show_progress=interactive + graphkb_conn, + structural_variants, + kb_disease_match, + show_progress=interactive, ) ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(copy_variants)} copy variants') + logger.info(f"annotating {len(copy_variants)} copy variants") gkb_matches.extend( annotate_copy_variants( graphkb_conn, copy_variants, kb_disease_match, show_progress=interactive @@ -363,10 +394,13 @@ def ipr_report( ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") - logger.info(f'annotating {len(expression_variants)} expression variants') + logger.info(f"annotating {len(expression_variants)} expression variants") gkb_matches.extend( annotate_expression_variants( - graphkb_conn, expression_variants, kb_disease_match, show_progress=interactive + graphkb_conn, + expression_variants, + kb_disease_match, + show_progress=interactive, ) ) logger.debug(f"\tgkb_matches: {len(gkb_matches)}") @@ -378,68 +412,79 @@ def ipr_report( if tmb_matches: all_variants.append(tmb_variant) # type: ignore - if match_germline: # verify germline kb statements matched germline observed variants + if ( + match_germline + ): # verify germline kb statements matched germline observed variants gkb_matches = germline_kb_matches(gkb_matches, all_variants) if gkb_matches: - 
logger.info(f"Removing {len(gkb_matches)} germline events without medical matches.") + logger.info( + f"Removing {len(gkb_matches)} germline events without medical matches." + ) if custom_kb_match_filter: - logger.info(f'custom_kb_match_filter on {len(gkb_matches)} variants') + logger.info(f"custom_kb_match_filter on {len(gkb_matches)} variants") gkb_matches = custom_kb_match_filter(gkb_matches) - logger.info(f'\t custom_kb_match_filter left {len(gkb_matches)} variants') + logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants") key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants) - logger.info('fetching gene annotations') + logger.info("fetching gene annotations") gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants)) if generate_therapeutics: - logger.info('generating therapeutic options') + logger.info("generating therapeutic options") targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants) else: targets = [] - logger.info('generating analyst comments') + logger.info("generating analyst comments") if generate_comments: comments = { - 'comments': summarize( - graphkb_conn, gkb_matches, disease_name=kb_disease_match, variants=all_variants + "comments": summarize( + graphkb_conn, + gkb_matches, + disease_name=kb_disease_match, + variants=all_variants, ) } else: - comments = {'comments': ''} + comments = {"comments": ""} # thread safe deep-copy the original content output = json.loads(json.dumps(content)) output.update( { - 'kbMatches': [trim_empty_values(a) for a in gkb_matches], - 'copyVariants': [ - trim_empty_values(c) for c in copy_variants if c['gene'] in genes_with_variants + "kbMatches": [trim_empty_values(a) for a in gkb_matches], + "copyVariants": [ + trim_empty_values(c) + for c in copy_variants + if c["gene"] in genes_with_variants ], - 'smallMutations': [trim_empty_values(s) for s in small_mutations], - 'expressionVariants': [ + "smallMutations": 
[trim_empty_values(s) for s in small_mutations], + "expressionVariants": [ trim_empty_values(e) for e in expression_variants - if e['gene'] in genes_with_variants + if e["gene"] in genes_with_variants ], - 'kbDiseaseMatch': kb_disease_match, - 'kbUrl': graphkb_conn.url, - 'kbVersion': timestamp(), - 'structuralVariants': [ + "kbDiseaseMatch": kb_disease_match, + "kbUrl": graphkb_conn.url, + "kbVersion": timestamp(), + "structuralVariants": [ trim_empty_values(s) for s in filter_structural_variants( structural_variants, gkb_matches, gene_information ) ], - 'genes': gene_information, - 'genomicAlterationsIdentified': key_alterations, - 'variantCounts': variant_counts, - 'analystComments': comments, - 'therapeuticTarget': targets, + "genes": gene_information, + "genomicAlterationsIdentified": key_alterations, + "variantCounts": variant_counts, + "analystComments": comments, + "therapeuticTarget": targets, } ) - output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants)) + output.setdefault("images", []).extend( + select_expression_plots(gkb_matches, all_variants) + ) output = clean_unsupported_content(output, ipr_spec) ipr_result = None @@ -447,7 +492,7 @@ def ipr_report( if ipr_upload: try: - logger.info(f'Uploading to IPR {ipr_conn.url}') + logger.info(f"Uploading to IPR {ipr_conn.url}") ipr_result = ipr_conn.upload_report(output, async_upload, mins_to_wait) logger.info(ipr_result) output.update(ipr_result) @@ -456,11 +501,11 @@ def ipr_report( logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True) if output_json_path: if always_write_output_json or not ipr_result: - logger.info(f'Writing IPR upload json to: {output_json_path}') - with open(output_json_path, 'w') as fh: + logger.info(f"Writing IPR upload json to: {output_json_path}") + with open(output_json_path, "w") as fh: fh.write(json.dumps(output)) - logger.info(f'made {graphkb_conn.request_count} requests to graphkb') - logger.info(f'average load 
{int(graphkb_conn.load or 0)} req/s') + logger.info(f"made {graphkb_conn.request_count} requests to graphkb") + logger.info(f"average load {int(graphkb_conn.load or 0)} req/s") if upload_error: raise upload_error return output diff --git a/pori_python/ipr/summary.py b/pori_python/ipr/summary.py index 491bfad..a91f57e 100644 --- a/pori_python/ipr/summary.py +++ b/pori_python/ipr/summary.py @@ -1,14 +1,14 @@ import base64 import json +from typing import Callable, Dict, List, Sequence, Set, Tuple +from urllib.parse import urlencode + from pori_python.graphkb import GraphKBConnection from pori_python.graphkb.constants import RELEVANCE_BASE_TERMS from pori_python.graphkb.statement import categorize_relevance from pori_python.graphkb.types import Ontology, Record from pori_python.graphkb.util import convert_to_rid_list from pori_python.graphkb.vocab import get_term_tree -from typing import Callable, Dict, List, Sequence, Set, Tuple -from urllib.parse import urlencode - from pori_python.ipr.inputs import create_graphkb_sv_notation from .types import GkbStatement, IprVariant, KbMatch @@ -20,10 +20,10 @@ logger, ) -OTHER_DISEASES = 'other disease types' -ENTREZ_GENE_URL = 'https://www.ncbi.nlm.nih.gov/gene' +OTHER_DISEASES = "other disease types" +ENTREZ_GENE_URL = "https://www.ncbi.nlm.nih.gov/gene" # TODO: https://www.bcgsc.ca/jira/browse/DEVSU-1181 -GRAPHKB_GUI = 'https://graphkb.bcgsc.ca' +GRAPHKB_GUI = "https://graphkb.bcgsc.ca" def filter_by_record_class( @@ -37,32 +37,33 @@ def check(name: str) -> bool: else: return name in record_classes - return [rec for rec in record_list if check(rec['@class'])] + return [rec for rec in record_list if check(rec["@class"])] def natural_join(word_list: List[str]) -> str: if len(word_list) > 1: - return ', '.join(word_list[:-1]) + ', and ' + word_list[-1] - return ''.join(word_list) + return ", ".join(word_list[:-1]) + ", and " + word_list[-1] + return "".join(word_list) def natural_join_records( - records: Sequence[Record], 
covert_to_word: Callable[[Dict], str] = lambda x: x['displayName'] + records: Sequence[Record], + covert_to_word: Callable[[Dict], str] = lambda x: x["displayName"], ) -> str: word_list = sorted(list({covert_to_word(rec) for rec in records})) return natural_join(word_list) -def create_graphkb_link(record_ids: List[str], record_class: str = 'Statement') -> str: +def create_graphkb_link(record_ids: List[str], record_class: str = "Statement") -> str: """ Create a link for a set of statements to the GraphKB client """ record_ids = sorted(list(set(record_ids))) if len(record_ids) == 1: return f'{GRAPHKB_GUI}/view/{record_class}/{record_ids[0].replace("#", "")}' - complex_param = base64.b64encode(json.dumps({'target': record_ids}).encode("utf-8")) - search_params = {'complex': complex_param, '@class': record_class} - return f'{GRAPHKB_GUI}/data/table?{urlencode(search_params)}' + complex_param = base64.b64encode(json.dumps({"target": record_ids}).encode("utf-8")) + search_params = {"complex": complex_param, "@class": record_class} + return f"{GRAPHKB_GUI}/data/table?{urlencode(search_params)}" def substitute_sentence_template( @@ -77,60 +78,78 @@ def substitute_sentence_template( """Create the filled-in sentence template for a given template and list of substitutions which may be the result of the aggregation of 1 or more statements. 
""" - disease_conditions = filter_by_record_class(conditions, 'Disease') + disease_conditions = filter_by_record_class(conditions, "Disease") variant_conditions = filter_by_record_class( - conditions, 'CategoryVariant', 'CatalogueVariant', 'PositionalVariant' + conditions, "CategoryVariant", "CatalogueVariant", "PositionalVariant" ) other_conditions = filter_by_record_class( conditions, - 'CategoryVariant', - 'CatalogueVariant', - 'PositionalVariant', - 'Disease', + "CategoryVariant", + "CatalogueVariant", + "PositionalVariant", + "Disease", exclude=True, ) - result = template.replace(r'{relevance}', relevance['displayName']) + result = template.replace(r"{relevance}", relevance["displayName"]) def merge_diseases(diseases: List[Ontology]) -> str: if len(convert_to_rid_set(diseases) - disease_matches) >= 2 and all( - [d['@class'] == 'Disease' for d in diseases] + [d["@class"] == "Disease" for d in diseases] ): words = sorted( - list(set([s['displayName'] for s in diseases if s['@rid'] in disease_matches])) + list( + set( + [ + s["displayName"] + for s in diseases + if s["@rid"] in disease_matches + ] + ) + ) ) words.append(OTHER_DISEASES) return natural_join(words) else: return natural_join_records(diseases) - if r'{subject}' in template: + if r"{subject}" in template: # remove subject from the conditions replacements subjects_ids = convert_to_rid_set(subjects) - disease_conditions = [d for d in disease_conditions if d['@rid'] not in subjects_ids] - variant_conditions = [d for d in variant_conditions if d['@rid'] not in subjects_ids] - other_conditions = [d for d in other_conditions if d['@rid'] not in subjects_ids] + disease_conditions = [ + d for d in disease_conditions if d["@rid"] not in subjects_ids + ] + variant_conditions = [ + d for d in variant_conditions if d["@rid"] not in subjects_ids + ] + other_conditions = [ + d for d in other_conditions if d["@rid"] not in subjects_ids + ] - result = result.replace(r'{subject}', merge_diseases(subjects)) + result = 
result.replace(r"{subject}", merge_diseases(subjects)) - if r'{conditions:disease}' in template: - result = result.replace(r'{conditions:disease}', merge_diseases(disease_conditions)) + if r"{conditions:disease}" in template: + result = result.replace( + r"{conditions:disease}", merge_diseases(disease_conditions) + ) else: other_conditions.extend(disease_conditions) - if r'{conditions:variant}' in template: - result = result.replace(r'{conditions:variant}', natural_join_records(variant_conditions)) + if r"{conditions:variant}" in template: + result = result.replace( + r"{conditions:variant}", natural_join_records(variant_conditions) + ) else: other_conditions.extend(variant_conditions) - result = result.replace(r'{conditions}', natural_join_records(other_conditions)) + result = result.replace(r"{conditions}", natural_join_records(other_conditions)) - link_url = create_graphkb_link(statement_rids) if statement_rids else '' + link_url = create_graphkb_link(statement_rids) if statement_rids else "" - if r'{evidence}' in template: - evidence_str = ', '.join(sorted(list({e['displayName'] for e in evidence}))) + if r"{evidence}" in template: + evidence_str = ", ".join(sorted(list({e["displayName"] for e in evidence}))) if link_url: evidence_str = f'{evidence_str}' - result = result.replace(r'{evidence}', evidence_str) + result = result.replace(r"{evidence}", evidence_str) return result @@ -148,18 +167,20 @@ def aggregate_statements( def generate_key(statement: GkbStatement) -> Tuple: result = [ - cond['displayName'] - for cond in filter_by_record_class(statement['conditions'], 'Disease', exclude=True) - if cond['@rid'] != statement['subject']['@rid'] + cond["displayName"] + for cond in filter_by_record_class( + statement["conditions"], "Disease", exclude=True + ) + if cond["@rid"] != statement["subject"]["@rid"] ] - if statement.get('subject', {}).get('@class', 'Disease') != 'Disease': - subject = statement['subject'] - if subject['@class'] == 'Therapy': - alt = 
get_preferred_drug_representation(graphkb_conn, subject['@rid']) - statement['subject'] = alt - result.append(statement['subject']['displayName']) - result.append(statement['relevance']['displayName']) - result.append(statement['displayNameTemplate']) + if statement.get("subject", {}).get("@class", "Disease") != "Disease": + subject = statement["subject"] + if subject["@class"] == "Therapy": + alt = get_preferred_drug_representation(graphkb_conn, subject["@rid"]) + statement["subject"] = alt + result.append(statement["subject"]["displayName"]) + result.append(statement["relevance"]["displayName"]) + result.append(statement["displayNameTemplate"]) return tuple(sorted(set(result))) for statement in statements: @@ -171,12 +192,12 @@ def generate_key(statement: GkbStatement) -> Tuple: conditions = [] subjects = [] evidence = [] - relevance = group[0]['relevance'] - template = group[0]['displayNameTemplate'] + relevance = group[0]["relevance"] + template = group[0]["displayNameTemplate"] for statement in group: - conditions.extend(statement['conditions']) - evidence.extend(statement['evidence']) - subjects.append(statement['subject']) + conditions.extend(statement["conditions"]) + evidence.extend(statement["evidence"]) + subjects.append(statement["subject"]) sentence = substitute_sentence_template( template, @@ -189,17 +210,17 @@ def generate_key(statement: GkbStatement) -> Tuple: ) for statement in group: - result[statement['@rid']] = sentence + result[statement["@rid"]] = sentence return result def display_variant(variant: IprVariant) -> str: """Short, human readable variant description string.""" - gene = variant.get('gene', '') - if not gene and 'gene1' in variant and 'gene2' in variant: + gene = variant.get("gene", "") + if not gene and "gene1" in variant and "gene2" in variant: gene = f'({variant.get("gene1", "")},{variant.get("gene2", "")})' - if variant.get('kbCategory'): + if variant.get("kbCategory"): return f'{variant.get("kbCategory")} of {gene}' # Special 
display of IprFusionVariant with exons @@ -208,28 +229,32 @@ def display_variant(variant: IprVariant) -> str: # Use chosen legacy 'proteinChange' or an hgvs description of lowest detail. hgvs = variant.get( - 'proteinChange', - variant.get('hgvsProtein', variant.get('hgvsCds', variant.get('hgvsGenomic', ''))), + "proteinChange", + variant.get( + "hgvsProtein", variant.get("hgvsCds", variant.get("hgvsGenomic", "")) + ), ) if gene and hgvs: - return f'{gene}:{hgvs}' + return f"{gene}:{hgvs}" elif variant.get("variant"): return variant.get("variant") - raise ValueError(f'Unable to form display_variant of {variant}') + raise ValueError(f"Unable to form display_variant of {variant}") def display_variants(gene_name: str, variants: List[IprVariant]) -> str: - result = sorted(list({v for v in [display_variant(e) for e in variants] if gene_name in v})) + result = sorted( + list({v for v in [display_variant(e) for e in variants] if gene_name in v}) + ) variants_text = natural_join(result) if len(result) > 1: + return f"Multiple variants of the gene {gene_name} were observed in this case: {variants_text}" + elif result: return ( - f'Multiple variants of the gene {gene_name} were observed in this case: {variants_text}' + f"{variants_text[0].upper()}{variants_text[1:]} was observed in this case." ) - elif result: - return f'{variants_text[0].upper()}{variants_text[1:]} was observed in this case.' - return '' + return "" def create_section_html( @@ -242,14 +267,16 @@ def create_section_html( """ Generate HTML for a gene section of the comments """ - output = [f'
{description}.
{variants_text}
-''' +""" ) sentences_used: Set[str] = set() for section in [ - {s for (s, v) in sentence_categories.items() if v == 'diagnostic'}, - {s for (s, v) in sentence_categories.items() if v == 'biological'}, - {s for (s, v) in sentence_categories.items() if v in ['therapeutic', 'prognostic']}, + {s for (s, v) in sentence_categories.items() if v == "diagnostic"}, + {s for (s, v) in sentence_categories.items() if v == "biological"}, { s for (s, v) in sentence_categories.items() - if v not in ['diagnostic', 'biological', 'therapeutic', 'prognostic', 'resistance'] + if v in ["therapeutic", "prognostic"] }, - {s for (s, v) in sentence_categories.items() if v == 'resistance'}, + { + s + for (s, v) in sentence_categories.items() + if v + not in [ + "diagnostic", + "biological", + "therapeutic", + "prognostic", + "resistance", + ] + }, + {s for (s, v) in sentence_categories.items() if v == "resistance"}, ]: - content = '. '.join(sorted(list(section - sentences_used))) + content = ". ".join(sorted(list(section - sentences_used))) sentences_used.update(section) - output.append(f'{content}
') - return '\n'.join(output) + output.append(f"{content}
") + return "\n".join(output) def section_statements_by_genes( @@ -315,16 +358,16 @@ def section_statements_by_genes( genes: Dict[str, Set[str]] = {} for statement in statements: - for condition in statement['conditions']: - if condition.get('biotype', '') == 'gene': - gene = get_preferred_gene_name(graphkb_conn, condition['@rid']) - genes.setdefault(gene, set()).add(statement['@rid']) + for condition in statement["conditions"]: + if condition.get("biotype", "") == "gene": + gene = get_preferred_gene_name(graphkb_conn, condition["@rid"]) + genes.setdefault(gene, set()).add(statement["@rid"]) else: - for cond_ref_key in ('reference1', 'reference2'): + for cond_ref_key in ("reference1", "reference2"): cond_ref_gene = condition.get(cond_ref_key) if cond_ref_gene: gene = get_preferred_gene_name(graphkb_conn, str(cond_ref_gene)) - genes.setdefault(gene, set()).add(statement['@rid']) + genes.setdefault(gene, set()).add(statement["@rid"]) return genes @@ -338,12 +381,12 @@ def summarize( """Given a list of GraphKB matches, generate a text summary to add to the report.""" templates: Dict[str, List[GkbStatement]] = {} statements: Dict[str, GkbStatement] = {} - variants_by_keys = {v['key']: v for v in variants} + variants_by_keys = {v["key"]: v for v in variants} variant_keys_by_statement_ids: Dict[str, Set[str]] = {} for match in matches: - rid = match['kbStatementId'] - exp_variant = match['variant'] + rid = match["kbStatementId"] + exp_variant = match["variant"] variant_keys_by_statement_ids.setdefault(rid, set()).add(exp_variant) exp_variants_by_statements: Dict[str, List[IprVariant]] = {} @@ -355,27 +398,31 @@ def summarize( exp_variants_by_statements[rid] = [] disease_matches = convert_to_rid_set( - get_term_tree(graphkb_conn, disease_name, ontology_class='Disease') + get_term_tree(graphkb_conn, disease_name, ontology_class="Disease") ) # get details for statements for match in matches: - rid = match['kbStatementId'].replace('#', '') - result = 
graphkb_conn.request(f'/statements/{rid}?neighbors=1')['result'] + rid = match["kbStatementId"].replace("#", "") + result = graphkb_conn.request(f"/statements/{rid}?neighbors=1")["result"] - templates.setdefault(result['displayNameTemplate'], []).append(result) - statements[result['@rid']] = result + templates.setdefault(result["displayNameTemplate"], []).append(result) + statements[result["@rid"]] = result # aggregate similar sentences sentences = {} for template, group in templates.items(): - sentences.update(aggregate_statements(graphkb_conn, template, group, disease_matches)) + sentences.update( + aggregate_statements(graphkb_conn, template, group, disease_matches) + ) # section statements by genes - statements_by_genes = section_statements_by_genes(graphkb_conn, list(statements.values())) + statements_by_genes = section_statements_by_genes( + graphkb_conn, list(statements.values()) + ) output: List[str] = [ - '