-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSFtool.py
executable file
·171 lines (137 loc) · 6.7 KB
/
SFtool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Herramienta para el manejo automático de hallazgos secundarios.
Esta herramienta permite a los usuarios analizar archivos VCF para el manejo automático de hallazgos secundarios relacionados con riesgo personal, riesgo reproductivo y farmacogenético.
@Dependencies InterVar and AnnoVar
@Usage python3 SFtool.py input_file.vcf --mode <Option: 'basic' or 'advanced'> --evidence <integer> --assembly <Option: '37' or '38'>
@Arguments:
-vcf (str): Ruta al archivo VCF de entrada.
-outpath (str): Ruta al directorio donde se guardarán los resultados.
-mode (str): Modo de análisis (básico o avanzado).
-evidence (int): Nivel de evidencia de ClinVar para el modo avanzado (1-4).#comprobar que lo he puesto de memoria
-assembly (str): Ensamblaje genómico a utilizar (GRCh37 o GRCh38).
@Author Javier Perez FLorido, Edurne Urrutia Lafuente
@Date 2023/08/01
@email [email protected], [email protected]
@github https://github.com/babelomics/secondaryfindings
"""
import os
import json
from modules.misc.arguments import arguments
from modules.misc.build_json_bed_files import build_json_bed_files
from modules.misc.clinvar_utils import clinvar_manager
from modules.FG.run_fg_module import run_pharmacogenomic_risk_module
from modules.PR_RR.run_pers_repro_risk_module import run_pers_repro_risk_module
from modules.misc.check_dependencies import check_dependencies
from modules.misc.vcf_utils import normalize_vcf, intersect_vcf_with_bed
from modules.misc.report_utils import generate_report
def main():
"""
Get the arguments from command line
"""
[args, categories] = arguments()
vcf_file = args.vcf_file
mode = args.mode
evidence = args.evidence
assembly = str(args.assembly)
out_path = args.output_dir
temp_path = os.path.join(out_path,'tmp')
if not os.path.exists(temp_path):
os.makedirs(temp_path)
"""
Read config file
"""
# Leer el archivo de configuración config.json
with open(args.config_file, "r") as config_file:
config_data = json.load(config_file)
# Get values from config file
categories_path = config_data["categories_path"]
clinvar_path = config_data["clinvar_path"]
clinvar_ddbb_version = config_data["clinvar_ddbb_version"]
bcftools_path = config_data["bcftools_path"]
htslib_path = config_data["htslib_path"]
personal_risk_geneset_file = config_data["personal_risk_geneset_file"]
reproductive_risk_geneset_file = config_data["reproductive_risk_geneset_file"]
genebe_path = config_data["genebe_path"]
java_path = config_data["java_path"]
genebe_apikey = config_data["genebe_apikey"]
genebe_username = config_data["genebe_username"]
python_path = config_data["python_path"]
pharmCAT_path = config_data["pharmCAT_path"]
if assembly == '37':
reference_genome = config_data["reference_genome_37_path"]
elif assembly == '38':
reference_genome = config_data["reference_genome_38_path"]
"""
Create several directories: clinvar, temp and final_output directories (if they dont exist)
"""
for folder in [clinvar_path, temp_path, out_path]:
if not os.path.exists(folder):
os.mkdir(folder)
"""
Check dependencies
"""
check_dependencies(genebe_path, bcftools_path, java_path, pharmCAT_path, python_path)
"""
Generate JSON and BED files for PR or RR categories
"""
if "pr" in categories:
# Check whether BED file (and consequently, JSON file) for each category exist. If not, create them
# Personal risk catalogue
if not os.path.exists(f"{categories_path}/PR/pr_risk_genes_GRCh{assembly}.bed"):
build_json_bed_files("pr", assembly, categories_path, personal_risk_geneset_file, vcf_file)
if "rr" in categories:
# Reproductive risk catalogue
if not os.path.exists(f"{categories_path}/RR/rr_risk_genes_GRCh{assembly}.bed"):
build_json_bed_files("rr", assembly, categories_path, reproductive_risk_geneset_file, vcf_file)
"""
In advanced mode, check/update clinVar database
"""
# If "advanced" mode, check whether Clinvar Database exists
if mode == 'advanced' and ("pr" in categories or "rr" in categories):
clinvar_db = clinvar_manager(clinvar_path, clinvar_ddbb_version, assembly)
else:
clinvar_db = None
"""
VCF normalization: only of PR or RR cateogry (FG has its own normalization procedure)
"""
if "pr" in categories or "rr" in categories:
norm_vcf_file = normalize_vcf(vcf_file, temp_path, bcftools_path, reference_genome)
"""
Normalized VCF and BED intersection for each category
"""
input_vcf_files = {}
for category in categories:
if category == "pr" or category == "rr":
category_bed_file = os.path.join(categories_path + category.upper(), category + '_risk_genes_GRCh' + assembly + '.bed')
generated_vcf_file = intersect_vcf_with_bed(norm_vcf_file, category_bed_file, temp_path, category)
input_vcf_files[category] = generated_vcf_file
"""
Execute modules selected by the user according to categories
"""
# Run modules selected by user
if "pr" in categories:
# Run Personal Risk (PR) module
pr_results = run_pers_repro_risk_module(input_vcf_files['pr'], assembly, mode, evidence, clinvar_db, 'pr', personal_risk_geneset_file, genebe_path, java_path, genebe_apikey, genebe_username)
else:
pr_results = None
if "rr" in categories:
# Run Reproductive Risk (RR) module
rr_results = run_pers_repro_risk_module(input_vcf_files['rr'], assembly, mode, evidence, clinvar_db, 'rr', reproductive_risk_geneset_file, genebe_path, java_path, genebe_apikey, genebe_username)
else:
rr_results = None
if "fg" in categories: # Run Pharmacogenetic (FG) module - pharmCAT
if assembly == "38": # pharmCAT is only allowed for GRCh38 assembly
[pharmCAT_report_file, haplot_results] = run_pharmacogenomic_risk_module(vcf_file, python_path, pharmCAT_path, bcftools_path, htslib_path, java_path, out_path)
else:
haplot_results = None
print("Farmacogenomic module (pharmCAT) is available only for GRCh38 human assembly")
else:
haplot_results = None
"""
Create report
"""
generate_report(pr_results, rr_results, haplot_results, pharmCAT_report_file, config_data, args, clinvar_db, categories, out_path)
if __name__ == "__main__":
main()