Skip to content

Commit 71c56a0

Browse files
committed
🚂Append Ensembl&Eutils API to retrieve sequences
1 parent 6485591 commit 71c56a0

File tree

12 files changed

+255
-35
lines changed

12 files changed

+255
-35
lines changed

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
Profile protein structures from the Protein Data Bank and integrate various resources.
1414

15+
[![HitCount](http://hits.dwyl.com/naturegeorge/pdb-profiling.svg)](http://hits.dwyl.com/naturegeorge/pdb-profiling)
16+
1517
## Goal
1618

1719
* Gather helpful/insightful indexes to evaluate a PDB structure's usefulness in:
@@ -45,6 +47,12 @@ Profiling Protein Structures from Protein Data Bank and integrate various resour
4547
* Interactome3D API
4648
* <https://interactome3d.irbbarcelona.org/>
4749
* ModBase API (?)
50+
* Ensembl REST API
51+
* <https://rest.ensembl.org/documentation>
52+
* NOTE: currently only supports <https://rest.ensembl.org/documentation/info/sequence_id> to retrieve Ensembl sequences
53+
* Eutils API
54+
* <https://eutils.ncbi.nlm.nih.gov/entrez/eutils/>
55+
* NOTE: currently only supports minimal usage
4856
* Download data from the PDB Archive to cover unexpected needs
4957
* wwPDB & RCSB: <https://ftp.wwpdb.org/pub/pdb/data/structures/>
5058
* EBI: <http://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/>

pdb_profiling/__init__.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,21 +36,13 @@ def default_config():
3636
from pdb_profiling.processors.uniprot.api import UniProtFASTA
3737
# Use Existing Handled PDBe API Results (e.g. tsv format results)
3838
ProcessPDBe.use_existing = True
39-
# Init Abclog Logger
40-
Abclog.init_logger(logName='PDB-Profiling')
41-
# Init ProcessPDBe's Logger (pass it with Abclog Logger)
42-
ProcessPDBe.init_logger(logger=Abclog.logger)
4339
# Use Existing API Results (e.g. json format results downloaded from web)
4440
UnsyncFetch.use_existing = True
41+
# Init Abclog Logger
42+
Abclog.init_logger(logName='PDB-Profiling')
4543
# Init WebFetcher's Logger (pass it with Abclog Logger)
46-
UnsyncFetch.init_setting(ProcessPDBe.logger)
44+
UnsyncFetch.init_setting(Abclog.logger)
4745
# Set WebFetcher's Semaphore
4846
Base.set_web_semaphore(30)
4947
# Set Folder that store downloaded and handled files
5048
Base.set_folder('./')
51-
# Init ModelServer API's Logger (pass it with Abclog Logger)
52-
PDBeModelServer.init_logger(logger=ProcessPDBe.logger)
53-
# Init PDBArchive API's Logger (pass it with Abclog Logger)
54-
PDBArchive.init_logger(logger=ProcessPDBe.logger)
55-
# Init UniProtFASTA API's Logger (pass it with Abclog Logger)
56-
UniProtFASTA.init_logger(logger=ProcessPDBe.logger)

pdb_profiling/processors/ensembl/__init__.py

Whitespace-only changes.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# @Created Date: 2020-09-26 01:54:56 pm
2+
# @Filename: api.py
3+
# @Email: 1730416009@stu.suda.edu.cn
4+
# @Author: ZeFeng Zhu
5+
# @Last Modified: 2020-09-26 01:55:04 pm
6+
# @Copyright (c) 2020 MinghuiGroup, Soochow University
7+
from typing import Dict, Tuple, Iterable, Generator, Union
8+
from pathlib import Path
9+
from pdb_profiling.log import Abclog
10+
from pdb_profiling.fetcher.webfetch import UnsyncFetch
11+
12+
13+
BASE_URL = 'https://rest.ensembl.org/'
14+
15+
16+
class EnsemblAPI(Abclog):
    '''
    Implement The Ensembl REST API

    * <https://rest.ensembl.org/documentation/>
    * DEMO: <https://rest.ensembl.org/sequence/id/ENST00000288602?content-type=text/x-fasta;type=protein>

    Each task produced by this class is a ``(method, request_kwargs, path)``
    tuple that is handed to ``UnsyncFetch``; ``path`` is
    ``<folder>/<identifier>.<file suffix>``.
    '''
    # Headers sent with every request; the Content-Type also decides the
    # file suffix (see get_file_suffix).
    headers = {"Content-Type": "text/x-fasta"}
    # Endpoint suffixes (relative to BASE_URL) accepted by retrieve/single_retrieve.
    api_set = frozenset(('sequence/id/',))

    @classmethod
    def get_file_suffix(cls) -> str:
        '''
        Derive the output file suffix from ``cls.headers["Content-Type"]``.

        ``text/plain`` -> ``plain``, ``text/x-fasta`` -> ``fasta``,
        ``text/x-seqxml+xml`` -> ``xml``.

        :raises AssertionError: if the content subtype is not one of the above
        '''
        res = cls.headers["Content-Type"].split('/')[1]
        assert res in ('plain', 'x-seqxml+xml', 'x-fasta'), f"Unexpected Case: {cls.headers}"
        return res.replace('x-', '').replace('seqxml+', '')

    @classmethod
    def task_unit(cls, suffix: str, identifier: str, params: Dict, folder: Path) -> Tuple:
        '''
        Build the task tuple for a single identifier.

        :param suffix: endpoint suffix appended to ``BASE_URL``
        :param identifier: identifier appended to the URL and used as file stem
        :param params: query parameters passed along with the request
        :param folder: folder of the target file
        :return: ``('get', request_kwargs, folder/<identifier>.<file suffix>)``
        '''
        args = dict(
            url=f'{BASE_URL}{suffix}{identifier}',
            headers=cls.headers,
            params=params)
        return 'get', args, folder/f'{identifier}.{cls.get_file_suffix()}'

    @classmethod
    def yieldTasks(cls, suffix: str, identifiers: Iterable[str], params_collection: Iterable[Dict], folder: Path) -> Generator:
        '''
        Lazily yield one task tuple per ``(identifier, params)`` pair.

        NOTE: the two iterables are paired with ``zip``, so surplus entries of
        the longer one are silently ignored.
        '''
        for identifier, params in zip(identifiers, params_collection):
            yield cls.task_unit(suffix, identifier, params, folder)

    @classmethod
    def retrieve(cls, suffix: str, identifiers: Iterable[str], params_collection: Iterable[Dict], folder: Union[Path, str], concur_req: int = 20, rate: float = 1.5, ret_res: bool = True, **kwargs):
        '''
        Submit all tasks for ``identifiers`` through ``UnsyncFetch.multi_tasks``.

        ``concur_req``, ``rate`` and ``ret_res`` are forwarded unchanged; an
        optional ``semaphore`` keyword argument is forwarded as well
        (``None`` when absent).

        :raises AssertionError: if ``suffix`` is not in ``cls.api_set``
        :return: whatever ``UnsyncFetch.multi_tasks`` returns
        '''
        assert suffix in cls.api_set, f"Invalid suffix! Valid set is \n{cls.api_set}"
        folder = Path(folder)
        res = UnsyncFetch.multi_tasks(
            cls.yieldTasks(suffix, identifiers, params_collection, folder),
            concur_req=concur_req,
            rate=rate,
            logger=cls.logger,
            ret_res=ret_res,
            semaphore=kwargs.get('semaphore', None))
        return res

    @classmethod
    def single_retrieve(cls, suffix: str, identifier: str, params: Dict, folder: Union[Path, str], semaphore, rate: float = 1.5):
        '''
        Submit the task of a single identifier through ``UnsyncFetch.single_task``.

        :raises AssertionError: if ``suffix`` is not in ``cls.api_set``
        :return: whatever ``UnsyncFetch.single_task`` returns
        '''
        assert suffix in cls.api_set, f"Invalid suffix! Valid set is \n{cls.api_set}"
        folder = Path(folder)
        return UnsyncFetch.single_task(
            task=cls.task_unit(suffix, identifier, params, folder),
            semaphore=semaphore,
            rate=rate)

pdb_profiling/processors/eutils/__init__.py

Whitespace-only changes.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# @Created Date: 2020-09-26 02:51:49 pm
2+
# @Filename: api.py
3+
# @Email: 1730416009@stu.suda.edu.cn
4+
# @Author: ZeFeng Zhu
5+
# @Last Modified: 2020-09-26 02:51:59 pm
6+
# @Copyright (c) 2020 MinghuiGroup, Soochow University
7+
from typing import Dict, Tuple, Iterable, Generator, Union
8+
from pathlib import Path
9+
from pdb_profiling.log import Abclog
10+
from pdb_profiling.fetcher.webfetch import UnsyncFetch
11+
12+
BASE_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
13+
14+
15+
class EutilsAPI(Abclog):
    '''
    Client for the Entrez Programming Utilities (E-utilities) API

    * Entrez Programming Utilities Help: <https://www.ncbi.nlm.nih.gov/books/NBK25501/>
    * DEMO URL 1: <https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=NP_001291289&rettype=fasta>
    * DEMO URL 2: <https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sequences&id=NM_001304360&rettype=fasta>

    Tasks are ``(method, request_kwargs, path)`` tuples handed to ``UnsyncFetch``.
    '''
    # Headers sent with every request.
    headers = {"Content-Type": "text/plain"}
    # Endpoint suffixes (relative to BASE_URL) accepted by retrieve/single_retrieve.
    api_set = frozenset(('efetch.fcgi', 'einfo.fcgi', 'esearch.fcgi',
                         'epost.fcgi', 'esummary.fcgi'))

    @classmethod
    def dumpsParams(cls, params: Dict) -> str:
        '''
        Flatten ``params`` into ``key=value`` pairs joined by ``&``
        (insertion order of the dict is kept, nothing is escaped).
        '''
        pairs = [f'{name}={val}' for name, val in params.items()]
        return '&'.join(pairs)

    @classmethod
    def task_unit(cls, suffix: str, params: Dict, folder: Path) -> Tuple:
        '''
        Build the task tuple for one set of query parameters.

        The file stem is the flattened parameter string; the extension is
        ``params["retmode"]`` if present, else ``params["rettype"]``, else ``txt``.

        :return: ``('get', request_kwargs, folder/<stem>.<extension>)``
        '''
        request_kwargs = {
            'url': f'{BASE_URL}{suffix}',
            'headers': cls.headers,
            'params': params,
        }
        stem = cls.dumpsParams(params)
        extension = params.get("retmode", params.get("rettype", "txt"))
        return 'get', request_kwargs, folder/f'{stem}.{extension}'

    @classmethod
    def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path) -> Generator:
        '''
        Lazily yield one task tuple per entry of ``params_collection``.
        '''
        yield from (cls.task_unit(suffix, params, folder) for params in params_collection)

    @classmethod
    def retrieve(cls, suffix: str, params_collection: Iterable[Dict], folder: Union[Path, str], concur_req: int = 20, rate: float = 1.5, ret_res: bool = True, **kwargs):
        '''
        Submit all tasks through ``UnsyncFetch.multi_tasks``.

        ``concur_req``, ``rate`` and ``ret_res`` are forwarded unchanged; an
        optional ``semaphore`` keyword argument is forwarded too (``None`` if absent).

        :raises AssertionError: if ``suffix`` is not in ``cls.api_set``
        :return: whatever ``UnsyncFetch.multi_tasks`` returns
        '''
        assert suffix in cls.api_set, f"Invalid suffix! Valid set is \n{cls.api_set}"
        task_stream = cls.yieldTasks(suffix, params_collection, Path(folder))
        return UnsyncFetch.multi_tasks(
            task_stream,
            concur_req=concur_req,
            rate=rate,
            logger=cls.logger,
            ret_res=ret_res,
            semaphore=kwargs.get('semaphore'))

    @classmethod
    def single_retrieve(cls, suffix: str, params: Dict, folder: Union[Path, str], semaphore, rate: float = 1.5):
        '''
        Submit a single task through ``UnsyncFetch.single_task``.

        :raises AssertionError: if ``suffix`` is not in ``cls.api_set``
        :return: whatever ``UnsyncFetch.single_task`` returns
        '''
        assert suffix in cls.api_set, f"Invalid suffix! Valid set is \n{cls.api_set}"
        unit = cls.task_unit(suffix, params, Path(folder))
        return UnsyncFetch.single_task(task=unit, semaphore=semaphore, rate=rate)

pdb_profiling/processors/pdbe/api.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636

3737
PDB_ARCHIVE_URL_EBI: str = 'http://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/'
3838
PDB_ARCHIVE_URL_WWPDB: str = 'https://ftp.wwpdb.org/pub/pdb/data/structures/'
39-
PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/
39+
PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/'
4040

4141
# https://ftp.wwpdb.org/pub/pdb/data/structures/obsolete/mmCIF/a0/2a01.cif.gz
4242
# http://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/obsolete/mmCIF/a0/2a01.cif.gz
@@ -707,7 +707,7 @@ class PDBeModelServer(Abclog):
707707

708708
root = 'model-server/v1/'
709709
headers = {'accept': 'text/plain', 'Content-Type': 'application/json'}
710-
api_sets = frozenset(('atoms', 'residueInteraction', 'assembly', 'full', 'ligand'
710+
# Valid model-server endpoint suffixes. Keep a comma after every entry:
# adjacent string literals are concatenated silently by Python.
api_set = frozenset(('atoms', 'residueInteraction', 'assembly', 'full', 'ligand',
                     'residueSurroundings', 'symmetryMates', 'query-many'))
712712

713713
@classmethod
@@ -765,7 +765,7 @@ class PDBArchive(Abclog):
765765
* EBI: PDB_ARCHIVE_URL_EBI: str = 'http://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/'
766766
'''
767767
root = PDB_ARCHIVE_URL_EBI
768-
api_sets = frozenset(f'{i}/{j}/' for i in ('obsolete', 'divided')
768+
api_set = frozenset(f'{i}/{j}/' for i in ('obsolete', 'divided')
769769
for j in ('mmCIF', 'pdb', 'XML'))
770770

771771
@classmethod
@@ -802,9 +802,14 @@ class PDBVersioned(PDBArchive):
802802
Download files from PDB Versioned
803803
804804
* wwwPDB Versioned: PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/entries/'
805+
806+
>>> PDBVersioned.single_retrieve(
807+
('2wmg', '_v1-2'), 'entries/',
808+
init_folder_from_suffix(Base.get_folder(), 'pdb-versioned/entries'),
809+
Base.get_web_semaphore()).result()
805810
'''
806811
root = PDB_ARCHIVE_VERSIONED_URL
807-
api_sets = frozenset(('entries/', 'removed/'))
812+
api_set = frozenset(('entries/', 'removed/'))
808813

809814
@classmethod
810815
def task_unit(cls, pdb_with_version: Tuple, suffix: str, file_suffix: str, folder: Path):

pdb_profiling/processors/pdbe/record.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def get_id(self):
200200
return self.pdb_id
201201

202202
def fetch_from_modelServer_api(self, api_suffix: str, method: str = 'post', data_collection=None, params=None, then_func: Optional[Callable[[Unfuture], Unfuture]] = None) -> Unfuture:
203-
assert api_suffix in PDBeModelServer.api_sets, f"Invlaid API SUFFIX! Valid set:\n{PDBeModelServer.api_sets}"
203+
assert api_suffix in PDBeModelServer.api_set, f"Invlaid API SUFFIX! Valid set:\n{PDBeModelServer.api_set}"
204204
task = self.tasks.get((PDBeModelServer.root, api_suffix, method, data_collection, params, then_func), None)
205205
if task is not None:
206206
return task
@@ -218,7 +218,7 @@ def fetch_from_modelServer_api(self, api_suffix: str, method: str = 'post', data
218218
return task
219219

220220
def fetch_from_PDBArchive(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, **kwargs) -> Unfuture:
221-
assert api_suffix in PDBArchive.api_sets, f"Invlaid API SUFFIX! Valid set:\n{PDBArchive.api_sets}"
221+
assert api_suffix in PDBArchive.api_set, f"Invlaid API SUFFIX! Valid set:\n{PDBArchive.api_set}"
222222
task = self.tasks.get((PDBArchive.root, api_suffix, then_func), None)
223223
if task is not None:
224224
return task
@@ -880,6 +880,7 @@ def __init__(self, identifier:str):
880880
self.set_id(identifier)
881881
self.tasks = dict()
882882

883+
883884
'''
884885
TODO: Deal with carbohydrate polymer in PISA
885886

pdb_profiling/processors/proteins/api.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ class ProteinsAPI(Abclog):
2121
'''
2222

2323
headers = {'Accept': 'application/json'}
24-
api_sets = frozenset((
25-
'coordinates', 'coordinates/', 'coordinates/location/',
26-
'uniparc', 'uniparc/accession/', 'uniparc/best/guess',
27-
'uniparc/dbreference/', 'uniparc/proteome/', 'uniparc/sequence', # NOTE: uniparc/sequence use POST method!
28-
'uniparc/upi/'))
24+
api_set = frozenset((
25+
'proteins', 'proteins/covid-19/entries', 'proteins/interaction/', 'proteins/',
26+
'coordinates', 'coordinates/', 'coordinates/location/',
27+
'uniparc', 'uniparc/accession/', 'uniparc/best/guess',
28+
'uniparc/dbreference/', 'uniparc/proteome/', 'uniparc/sequence', # NOTE: uniparc/sequence use POST method!
29+
'uniparc/upi/'))
2930

3031
@classmethod
3132
def get_file_suffix(cls) -> str:
@@ -43,7 +44,7 @@ def task_unit(cls, suffix: str, params: Dict, folder: Path, identifier:Optional[
4344
url=f'{BASE_URL}{suffix}' if identifier is None else f'{BASE_URL}{suffix}{identifier}',
4445
headers=cls.headers,
4546
params=params)
46-
return 'get', args, folder/f'{identifier if identifier is not None else cls.dumpsParams(params)}.{cls.get_file_suffix()}'
47+
return 'get', args, folder/f'{identifier.replace(":", "_")+"_"+cls.dumpsParams(params) if identifier is not None else cls.dumpsParams(params)}.{cls.get_file_suffix()}'
4748

4849
@classmethod
4950
def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path, identifiers: Optional[Iterable[str]]) -> Generator:
@@ -57,7 +58,7 @@ def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path
5758

5859
@classmethod
5960
def retrieve(cls, suffix: str, params_collection: Iterable[Dict], folder: Union[Path, str], identifiers: Optional[Iterable[str]] = None, concur_req: int = 20, rate: float = 1.5, ret_res: bool = True, **kwargs):
60-
assert suffix in cls.api_sets, f"Invalid suffix! Valid set is \n{cls.api_sets}"
61+
assert suffix in cls.api_set, f"Invalid suffix! Valid set is \n{cls.api_set}"
6162
folder = Path(folder)
6263
res = UnsyncFetch.multi_tasks(
6364
cls.yieldTasks(suffix, params_collection, folder, identifiers),
@@ -70,7 +71,7 @@ def retrieve(cls, suffix: str, params_collection: Iterable[Dict], folder: Union[
7071

7172
@classmethod
7273
def single_retrieve(cls, suffix: str, params:Dict, folder: Union[Path, str], semaphore, identifier:Optional[str]=None, rate: float = 1.5):
73-
assert suffix in cls.api_sets, f"Invalid suffix! Valid set is \n{cls.api_sets}"
74+
assert suffix in cls.api_set, f"Invalid suffix! Valid set is \n{cls.api_set}"
7475
folder = Path(folder)
7576
return UnsyncFetch.single_task(
7677
task=cls.task_unit(suffix, params, folder, identifier),

pdb_profiling/processors/uniprot/api.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -405,12 +405,16 @@ async def process(cls, path: Union[str, Path, Unfuture]):
405405
await fob
406406
return path
407407

408+
@classmethod
def task_unit(cls, unp: str, folder: Union[str, Path]):
    '''
    Build the task tuple for one entry.

    :param unp: identifier used as stem of the ``.fasta`` file name
        (presumably a UniProt accession -- confirm against callers)
    :param folder: folder of the target file
    :return: ``('get', {'url': ..., 'params': cls.params}, <folder>/<unp>.fasta as str)``
    '''
    fasta_name = f'{unp}.fasta'
    target_path = str(Path(folder) / fasta_name)
    request_kwargs = dict(url=f'{BASE_URL}/uniprot/{fasta_name}', params=cls.params)
    return 'get', request_kwargs, target_path
413+
408414
@classmethod
def yieldTasks(cls, lyst: Iterable, folder: Union[str, Path]) -> Generator:
    '''
    Lazily yield one task tuple (built by ``cls.task_unit``) per entry of ``lyst``.

    :param lyst: iterable of identifiers
    :param folder: folder passed through to ``cls.task_unit``
    :return: generator of task tuples
    '''
    for unp in lyst:
        # Must be ``yield``: a ``return`` here would hand back only the first
        # task tuple instead of a generator over all entries.
        yield cls.task_unit(unp, folder)
414418

415419
@classmethod
416420
def retrieve(cls, lyst: Iterable, folder: Union[str, Path], concur_req: int = 20, rate: float = 1.5, ret_res: bool = True, semaphore=None):
@@ -421,3 +425,10 @@ def retrieve(cls, lyst: Iterable, folder: Union[str, Path], concur_req: int = 20
421425
logger=cls.logger,
422426
ret_res=ret_res,
423427
semaphore=semaphore)
428+
429+
@classmethod
def single_retrieve(cls, identifier: str, folder: Union[str, Path], semaphore, rate: float = 1.5):
    '''
    Submit the task of a single identifier through ``UnsyncFetch.single_task``.

    :param identifier: identifier passed to ``cls.task_unit``
    :param folder: folder passed to ``cls.task_unit``
    :param semaphore: forwarded unchanged to ``UnsyncFetch.single_task``
    :param rate: forwarded unchanged to ``UnsyncFetch.single_task``
    :return: whatever ``UnsyncFetch.single_task`` returns
    '''
    unit = cls.task_unit(identifier, folder)
    return UnsyncFetch.single_task(task=unit, semaphore=semaphore, rate=rate)

0 commit comments

Comments
 (0)