Skip to content

Commit

Permalink
Merge pull request #69 from mihaitodor/spdi-normalization
Browse files Browse the repository at this point in the history
Add SPDI normalization without using NCBI APIs
  • Loading branch information
rhdolin authored Aug 31, 2023
2 parents a7c0bed + 34b8d1e commit f9b5802
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 214 deletions.
4 changes: 4 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
import flask
from flask_cors import CORS
import os
from .fasta import download_fasta


def create_app():
# First ensure we have the fasta files locally
download_fasta()

# App and API
options = {
'swagger_url': '/',
Expand Down
158 changes: 3 additions & 155 deletions app/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from threading import Lock
from uuid import uuid4
import pyliftover
import requests
from datetime import datetime
import pymongo
from flask import abort
from itertools import groupby
import re
from .input_normalization import normalize

# MongoDB Client URIs
FHIR_genomics_data_client_uri = "mongodb+srv://download:[email protected]/FHIRGenomicsData"
Expand Down Expand Up @@ -116,8 +116,6 @@ def get_liftover(from_db, to_db):

SUPPORTED_GENOMIC_SOURCE_CLASSES = ['germline', 'somatic']

NCBI_VARIATION_SERVICES_BASE_URL = 'https://api.ncbi.nlm.nih.gov/variation/v0/'

CHROMOSOME_CSV_FILE = 'app/_Dict_Chromosome.csv'

# Utility Functions
Expand Down Expand Up @@ -163,26 +161,6 @@ def merge_ranges(ranges):
return merged_ranges


def get_hgvs_contextuals_url(hgvs):
return f"{NCBI_VARIATION_SERVICES_BASE_URL}hgvs/{hgvs}/contextuals"


def get_spdi_all_equivalent_contextual_url(contextual_SPDI):
return f'{NCBI_VARIATION_SERVICES_BASE_URL}spdi/{contextual_SPDI}/all_equivalent_contextual'


def get_spdi_canonical_representative_url(contextual_SPDI):
return f'{NCBI_VARIATION_SERVICES_BASE_URL}spdi/{contextual_SPDI}/canonical_representative'


def build_spdi(seq_id, position, deleted_sequence, inserted_sequence):
return f"{seq_id}:{position}:{deleted_sequence}:{inserted_sequence}"


def get_spdi_elements(response_object):
return (response_object['seq_id'], response_object['position'], response_object['deleted_sequence'], response_object['inserted_sequence'])


def validate_subject(patient_id):
if not patients_db.find_one({"patientID": patient_id}):
abort(400, f"Patient ({patient_id}) not found.")
Expand All @@ -196,7 +174,7 @@ def get_variant(variant):
variant = variant.lstrip()

if variant.count(":") == 1: # HGVS expression
SPDIs = hgvs_2_contextual_SPDIs(variant)
SPDIs = normalize(variant)
if not SPDIs:
abort(400, f'Cannot normalize variant: {variant}')
elif not SPDIs["GRCh37"] and not SPDIs["GRCh38"]:
Expand All @@ -205,7 +183,7 @@ def get_variant(variant):
normalized_variant = {"variant": variant, "GRCh37": SPDIs["GRCh37"], "GRCh38": SPDIs["GRCh38"]}

elif variant.count(":") == 3: # SPDI expression
SPDIs = SPDI_2_contextual_SPDIs(variant)
SPDIs = normalize(variant)
if not SPDIs:
abort(400, f'Cannot normalize variant: {variant}')
elif not SPDIs["GRCh37"] and not SPDIs["GRCh38"]:
Expand Down Expand Up @@ -1001,136 +979,6 @@ def get_intersected_regions(bed_id, build, chrom, start, end, intersected_region
intersected_regions.append(f'{ref_seq}:{max(start, csePair["Start"])}-{min(end, csePair["End"])}')


def hgvs_2_contextual_SPDIs(hgvs):

# convert hgvs to contextualSPDI
url = get_hgvs_contextuals_url(hgvs)
headers = {'Accept': 'application/json'}

r = requests.get(url, headers=headers)
if r.status_code != 200:
return False

response = r.json()
raw_data = response['data']
raw_SPDI = raw_data['spdis'][0]

seq_id, position, deleted_sequence, inserted_sequence = get_spdi_elements(raw_SPDI)

contextual_SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)

# convert contextualSPDI to build37 and build38 contextual SPDIs
url = get_spdi_all_equivalent_contextual_url(contextual_SPDI)
headers = {'Accept': 'application/json'}

r = requests.get(url, headers=headers)
if r.status_code != 200:
return False

response = r.json()
raw_SPDI_List = response['data']['spdis']

b37SPDI = None
b38SPDI = None
for item in raw_SPDI_List:
if item['seq_id'].startswith("NC_"):
temp = get_build_and_chrom_by_ref_seq(item['seq_id'])
if temp:
seq_id, position, deleted_sequence, inserted_sequence = get_spdi_elements(item)

if temp['build'] == 'GRCh37':
b37SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)
elif temp['build'] == 'GRCh38':
b38SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)
else:
return False

return {"GRCh37": b37SPDI, "GRCh38": b38SPDI}


def hgvs_2_canonical_SPDI(hgvs):

# convert hgvs to contextualSPDI
url = get_hgvs_contextuals_url(hgvs)
headers = {'Accept': 'application/json'}

r = requests.get(url, headers=headers)
if r.status_code != 200:
return False

response = r.json()
raw_data = response['data']
raw_SPDI = raw_data['spdis'][0]

seq_id, position, deleted_sequence, inserted_sequence = get_spdi_elements(raw_SPDI)

contextual_SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)

# convert contextualSPDI to canonical SPDI
url = get_spdi_canonical_representative_url(contextual_SPDI)
headers = {'Accept': 'application/json'}

r = requests.get(url, headers=headers)
if r.status_code != 200:
return False

response = r.json()
raw_SPDI = response['data']

seq_id, position, deleted_sequence, inserted_sequence = get_spdi_elements(raw_SPDI)

canonical_SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)

return {"canonicalSPDI": canonical_SPDI}


def SPDI_2_contextual_SPDIs(spdi):
url = get_spdi_all_equivalent_contextual_url(spdi)
headers = {'Accept': 'application/json'}

r = requests.get(url, headers=headers)
if r.status_code != 200:
return False

response = r.json()
raw_SPDI_List = response['data']['spdis']

b37SPDI = None
b38SPDI = None
for item in raw_SPDI_List:
if item['seq_id'].startswith("NC_"):
temp = get_build_and_chrom_by_ref_seq(item['seq_id'])
if temp:
seq_id, position, deleted_sequence, inserted_sequence = get_spdi_elements(item)

if temp['build'] == 'GRCh37':
b37SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)
elif temp['build'] == 'GRCh38':
b38SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)
else:
return False

return {"GRCh37": b37SPDI, "GRCh38": b38SPDI}


def SPDI_2_canonical_SPDI(spdi):
url = get_spdi_canonical_representative_url(spdi)
headers = {'Accept': 'application/json'}

r = requests.get(url, headers=headers)
if r.status_code != 200:
return False

response = r.json()
raw_SPDI = response['data']

seq_id, position, deleted_sequence, inserted_sequence = get_spdi_elements(raw_SPDI)

canonical_SPDI = build_spdi(seq_id, position, deleted_sequence, inserted_sequence)

return {"canonicalSPDI": canonical_SPDI}


def query_clinvar_by_variants(normalized_variant_list, code_list, query, population=False):
variant_list = []
for item in normalized_variant_list:
Expand Down
50 changes: 42 additions & 8 deletions app/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,11 @@ def find_subject_specific_variants(
subject = subject.strip()
common.validate_subject(subject)

variants = list(map(common.get_variant, variants))
try:
variants = list(map(common.get_variant, variants))
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

# Query
query = {}
Expand Down Expand Up @@ -833,13 +837,22 @@ def find_subject_tx_implications(
if ranges:
ranges = list(map(common.get_range, ranges))
common.get_lift_over_range(ranges)
variants = common.get_variants(ranges, query)
try:
variants = common.get_variants(ranges, query)
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

if not variants:
return jsonify({"resourceType": "Parameters"})
normalized_variants = [{variant["BUILD"]: variant["SPDI"]} for variant in variants]

if variants and not ranges:
normalized_variants = list(map(common.get_variant, variants))
try:
normalized_variants = list(map(common.get_variant, variants))
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

# Result Object
result = OrderedDict()
Expand Down Expand Up @@ -1105,13 +1118,22 @@ def find_subject_dx_implications(
if ranges:
ranges = list(map(common.get_range, ranges))
common.get_lift_over_range(ranges)
variants = common.get_variants(ranges, query)
try:
variants = common.get_variants(ranges, query)
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

if not variants:
return jsonify({"resourceType": "Parameters"})
normalized_variants = [{variant["BUILD"]: variant["SPDI"]} for variant in variants]

if variants and not ranges:
normalized_variants = list(map(common.get_variant, variants))
try:
normalized_variants = list(map(common.get_variant, variants))
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

# Result Object
result = OrderedDict()
Expand Down Expand Up @@ -1373,7 +1395,11 @@ def find_population_specific_variants(
# Parameters
variants = list(map(lambda x: x.strip().split(","), variants))
for i in range(len(variants)):
variants[i] = list(map(common.get_variant, variants[i]))
try:
variants[i] = list(map(common.get_variant, variants[i]))
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

# Query
query = {}
Expand Down Expand Up @@ -1884,7 +1910,11 @@ def find_population_tx_implications(
return jsonify({"resourceType": "Parameters"})

if variants:
variants = list(map(common.get_variant, variants))
try:
variants = list(map(common.get_variant, variants))
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

condition_code_list = []
if conditions:
Expand Down Expand Up @@ -2090,7 +2120,11 @@ def find_population_dx_implications(
return jsonify({"resourceType": "Parameters"})

if variants:
variants = list(map(common.get_variant, variants))
try:
variants = list(map(common.get_variant, variants))
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}")
abort(422, 'Failed LiftOver')

condition_code_list = []
if conditions:
Expand Down
23 changes: 23 additions & 0 deletions app/fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from pathlib import Path
from pyfastx import Fasta
from urllib.request import urlretrieve


def download_fasta():
try:
# Make sure the parent folder exists
Path('FASTA').mkdir(exist_ok=True)

for build in ['GRCh37', 'GRCh38']:
filename = build + '_latest_genomic.fna.gz'
filepath = 'FASTA/' + filename

# Download files
if not Path(filepath).is_file():
urlretrieve('https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/' + build + '_latest/refseq_identifiers/' + filename, filepath)

# Build indexes
if not Path(filepath + '.fxi').is_file():
Fasta(filepath)
except Exception as error:
print(error)
Loading

0 comments on commit f9b5802

Please sign in to comment.