Skip to content

Commit 6cd14fb

Browse files
authored
Merge pull request #14 from krassowski/get-coding-consequence
Get coding consequence, add range example
2 parents 50b1e4a + 7c1909b commit 6cd14fb

File tree

4 files changed

+118
-3
lines changed

4 files changed

+118
-3
lines changed

README.md

+65
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,71 @@ The base position should use the latest genome assembly (GRCh38 at the time of w
184184
you can use the position in previous assembly coordinates by replacing `POSITION` with `POSITION_GRCH37`.
185185
For more information on the arguments accepted by the SNP database, see the [Entrez help page](https://www.ncbi.nlm.nih.gov/snp/docs/entrez_help/) on the NCBI website.
186186

187+
#### Obtaining amino acid change information for variants in a given range
188+
189+
First, we search for the dbSNP rs identifiers of variants in a given region:
190+
191+
```python
192+
dbsnp_ids = (
193+
entrez_api
194+
.search(
195+
'12[CHROMOSOME] AND human[ORGANISM] AND 21178600:21178720[POSITION]',
196+
database='snp',
197+
max_results=100
198+
)
199+
.data
200+
['esearchresult']
201+
['idlist']
202+
)
203+
```
204+
205+
Then fetch the variant data for these identifiers:
206+
207+
```python
208+
variant_data = entrez_api.fetch(
209+
['rs' + rs_id for rs_id in dbsnp_ids],
210+
max_results=10,
211+
database='snp'
212+
)
213+
```
214+
215+
Finally, parse the data and extract the protein-level HGVS notations from the summary:
216+
217+
```python
218+
from easy_entrez.parsing import parse_dbsnp_variants
219+
from pandas import Series
220+
221+
222+
def select_protein_hgvs(items):
    """Select protein-level HGVS notations from a list of HGVS strings.

    Args:
        items: list of ``'<sequence>:<hgvs>'`` strings for one variant.
            Anything that is not a list or tuple (for example the missing
            value found when a variant summary has no HGVS field) is
            treated as "no entries".

    Returns:
        List of ``[sequence, hgvs]`` pairs, keeping only the entries whose
        HGVS part starts with ``p.`` (protein change).
    """
    if not isinstance(items, (list, tuple)):
        # A missing value is not iterable; without this guard `.apply` would
        # raise TypeError for variants lacking HGVS data.
        return []

    selected = []
    for entry in items:
        # Split on the first colon only, so malformed entries are skipped
        # instead of raising an unpacking error.
        sequence, _, hgvs = entry.partition(':')
        if hgvs.startswith('p.'):
            selected.append([sequence, hgvs])
    return selected
229+
230+
231+
protein_hgvs = (
232+
parse_dbsnp_variants(variant_data)
233+
.summary
234+
.HGVS
235+
.apply(select_protein_hgvs)
236+
.explode()
237+
.dropna()
238+
.apply(Series)
239+
.rename(columns={0: 'sequence', 1: 'hgvs'})
240+
)
241+
protein_hgvs.head()
242+
```
243+
244+
> | rs_id | sequence | hgvs |
245+
> |:-------------|:------------|:------------|
246+
> | rs1940853486 | NP_006437.3 | p.Gly203Ter |
247+
> | rs1940853414 | NP_006437.3 | p.Glu202Gly |
248+
> | rs1940853378 | NP_006437.3 | p.Glu202Lys |
249+
> | rs1940853299 | NP_006437.3 | p.Lys201Thr |
250+
> | rs1940852987 | NP_006437.3 | p.Asp198Glu |
251+
187252
#### Find PubMed ID from DOI
188253

189254
When searching the GWAS Catalog, a PMID is needed rather than a DOI. You can convert one to the other using:

easy_entrez/parsing.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,25 @@ class VariantSet:
3939
alt_frequencies: DataFrame
4040
#: Preferred identifiers map (old → new); old != new for merged variants.
4141
preferred_ids: dict
42+
#: Data from DOCSUM field including GENE, HGVS, etc.
43+
summary: DataFrame
4244

4345
def __repr__(self):
4446
return f'<VariantSet with {len(self.coordinates)} variants>'
4547

4648

49+
def parse_docsum(docsum: str) -> dict:
    """Parse the text of a dbSNP ``DOCSUM`` field into a dictionary.

    The text is a ``|``-separated list of ``KEY=value`` fields, for example
    ``HGVS=...|SEQ=[A/G]|LEN=1|GENE=SLCO1B1:10599``.

    Args:
        docsum: raw ``DOCSUM`` text.

    Returns:
        Mapping of field name to value. ``HGVS`` is returned as a list of
        strings (split on ``,`` with ``&gt;`` decoded to ``>``) and ``LEN``
        as a float; all other fields keep their raw string value. An empty
        ``docsum`` yields an empty dictionary.

    Raises:
        ValueError: if a non-empty field contains no ``=``, or if ``LEN``
            is not a number.
    """
    result = {}
    if not docsum:
        return result

    for entry in docsum.split('|'):
        if not entry:
            # Tolerate trailing or doubled separators.
            continue
        # Split on the first '=' only: values may contain '=' themselves
        # (e.g. synonymous protein changes such as ``p.Gly12=``).
        key, separator, value = entry.partition('=')
        if not separator:
            raise ValueError(f'DOCSUM field without "=": {entry!r}')
        result[key] = value

    if 'HGVS' in result:
        result['HGVS'] = result['HGVS'].replace('&gt;', '>').split(',')
    if 'LEN' in result:
        result['LEN'] = float(result['LEN'])
    return result
59+
60+
4761
def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) -> VariantSet:
4862
"""Parse coordinates, frequencies and preferred IDs of dbSNP variants.
4963
@@ -62,6 +76,7 @@ def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) ->
6276
results = []
6377
alt_frequencies = []
6478
preferred_id = {}
79+
summaries = []
6580

6681
for i, snp in enumerate(snps):
6782
error = snp.find('.//ns0:error', namespaces)
@@ -80,6 +95,16 @@ def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) ->
8095
chrom_prev, pos_prev = snp.find('.//ns0:CHRPOS_PREV_ASSM', namespaces).text.split(':')
8196
sig_class = snp.find('.//ns0:FXN_CLASS', namespaces).text
8297

98+
doc_sum = snp.find('.//ns0:DOCSUM', namespaces).text
99+
try:
100+
doc_sum = parse_docsum(doc_sum)
101+
summaries.append({
102+
**doc_sum,
103+
'rs_id': f'rs{rs_id}'
104+
})
105+
except Exception as e:
106+
warn(f'Failed to parse DOCSUM: {e}')
107+
83108
merged_into = snp.find('.//ns0:SNP_ID', namespaces).text
84109
if rs_id != merged_into:
85110
was_merged = snp.find('.//ns0:MERGED_SORT', namespaces).text
@@ -138,7 +163,8 @@ def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) ->
138163
return VariantSet(
139164
coordinates=DataFrame(results).set_index('rs_id'),
140165
alt_frequencies=DataFrame(alt_frequencies),
141-
preferred_ids=preferred_id
166+
preferred_ids=preferred_id,
167+
summary=DataFrame(summaries).set_index('rs_id')
142168
)
143169

144170

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def get_long_description(file_name):
1414
package_data={'easy_entrez': ['data/*.tsv', 'py.typed']},
1515
# required for mypy to work
1616
zip_safe=False,
17-
version='0.3.4',
17+
version='0.3.5',
1818
license='MIT',
1919
description='Python REST API for Entrez E-Utilities: stateless, easy to use, reliable.',
2020
long_description=get_long_description('README.md'),

tests/test_parsing.py

+25-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from typing import Dict, Union
33
from dataclasses import dataclass
44
from xml.etree.ElementTree import Element, fromstring
5-
from easy_entrez.parsing import parse_dbsnp_variants, VariantSet
5+
from easy_entrez.parsing import parse_dbsnp_variants, VariantSet, parse_docsum
66
from easy_entrez.queries import FetchQuery
77
try:
88
from typing import Literal
@@ -17,6 +17,25 @@ class DummyResponse:
1717
data: Union[Element, Dict]
1818

1919

20+
DOCSUM_CODING = "HGVS=NC_000012.12:g.21178699A&gt;G,NC_000012.11:g.21331633A&gt;G,NG_011745.1:g.52506A&gt;G,NM_006446.5:c.605A&gt;G,NM_006446.4:c.605A&gt;G,NP_006437.3:p.Glu202Gly|SEQ=[A/G]|LEN=1|GENE=SLCO1B1:10599"
21+
22+
23+
def test_docsum():
    """A coding-variant DOCSUM is split into HGVS, SEQ, LEN and GENE fields."""
    expected_hgvs = [
        'NC_000012.12:g.21178699A>G',
        'NC_000012.11:g.21331633A>G',
        'NG_011745.1:g.52506A>G',
        'NM_006446.5:c.605A>G',
        'NM_006446.4:c.605A>G',
        'NP_006437.3:p.Glu202Gly',
    ]
    expected = {
        'HGVS': expected_hgvs,
        'SEQ': '[A/G]',
        'LEN': 1,
        'GENE': 'SLCO1B1:10599',
    }
    parsed = parse_docsum(DOCSUM_CODING)
    assert parsed == expected
37+
38+
2039
@pytest.mark.optional
2140
def test_parse_two_snps():
2241
response = DummyResponse(
@@ -50,6 +69,11 @@ def test_parse_two_snps():
5069
assert frequencies.total_count.min() > 0
5170
assert '1000Genomes' in set(frequencies.study)
5271

72+
summary = variant_set.summary
73+
assert len(summary) == 2
74+
assert set(summary.index) == {'rs6311', 'rs662138'}
75+
assert set(summary.columns) == {'HGVS', 'SEQ', 'LEN', 'GENE'}
76+
5377

5478
@pytest.mark.optional
5579
def test_merged_variant_solving():

0 commit comments

Comments
 (0)