Skip to content

Commit 1dadb81

Browse files
committed
update
1 parent 423be68 commit 1dadb81

File tree

4 files changed

+756
-0
lines changed

4 files changed

+756
-0
lines changed

src/RF.py

+336
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
import sys
2+
import collections
3+
import operator
4+
import itertools
5+
from bisect import bisect_left
6+
import os
7+
import glob
8+
import concurrent.futures
9+
import math
10+
import time
11+
12+
import matplotlib
13+
14+
matplotlib.use("Agg")
15+
import matplotlib.pyplot as plt
16+
from matplotlib.backends.backend_pdf import PdfPages
17+
18+
import RFcommonfn
19+
import RFread
20+
import RFlib
21+
22+
23+
Ent = RFcommonfn.Ent
24+
25+
param_set = {
26+
"mzML_files",
27+
"library",
28+
"ms1_ppm",
29+
"sample_info",
30+
}
31+
param_dict = RFcommonfn.read_param(param_set)
32+
mzML_files = sorted(glob.glob(param_dict["mzML_files"]))
33+
basename_l = [os.path.basename(x) for x in mzML_files]
34+
ms1ppm = float(param_dict["ms1_ppm"]) / 1e6
35+
36+
37+
def get_sample_info():
38+
sample_info_file = glob.glob(param_dict["sample_info"])[0]
39+
sample_type = dict()
40+
with open(sample_info_file) as sfile:
41+
next(sfile)
42+
for line in sfile:
43+
if line[0] != "#" and line.rstrip():
44+
lsp = line.rstrip("\n").split("\t")
45+
sample_type[lsp[0]] = lsp[1].upper()
46+
return sample_type
47+
48+
49+
def cos_sim(list1, list2):
50+
if len(list1) != len(list2):
51+
print("adf")
52+
sys.exit()
53+
if sum(list1) <= 0 or sum(list2) <= 0:
54+
return 0
55+
return sum(math.sqrt(x * y) for x, y in zip(list1, list2)) / math.sqrt(
56+
sum(list1) * sum(list2)
57+
)
58+
59+
60+
def aveMS1spec(mzML_file):
61+
basename0 = os.path.basename(mzML_file)
62+
print(basename0)
63+
ms1_scans, ms2_scans, _ = RFread.print_eic_ms(mzML_file)
64+
ms1_peaks = sorted(
65+
(mz, i, ii)
66+
for ii, ms1_ in enumerate(ms1_scans)
67+
for mz, i in zip(ms1_.mz_l, ms1_.I_l)
68+
)
69+
avespec = []
70+
while ms1_peaks:
71+
maxI = max(ms1_peaks, key=operator.itemgetter(1))[0]
72+
pos0 = bisect_left(ms1_peaks, (maxI - 0.005,))
73+
pos1 = bisect_left(ms1_peaks, (maxI + 0.005,))
74+
if len(set(ii for _, _, ii in ms1_peaks[pos0:pos1])) > len(ms1_scans) / 3:
75+
avespec.append(
76+
(maxI, sum(i for _, i, _ in ms1_peaks[pos0:pos1]) / len(ms1_scans))
77+
)
78+
del ms1_peaks[pos0:pos1]
79+
return sorted(avespec), ms2_scans, basename0
80+
81+
82+
from multiprocessing import freeze_support
83+
84+
if __name__ == "__main__":
85+
start_time = time.time()
86+
print(len(mzML_files), "mzML files")
87+
freeze_support()
88+
sample_type = get_sample_info()
89+
with concurrent.futures.ProcessPoolExecutor(max_workers=9) as executor:
90+
avespec_l = list(executor.map(aveMS1spec, mzML_files))
91+
92+
ms1_peaks = sorted((mz, i, ii) for ms1_, _, ii in avespec_l for mz, i in ms1_)
93+
conspec = []
94+
while ms1_peaks:
95+
maxI = max(ms1_peaks, key=operator.itemgetter(1))[0]
96+
pos0 = bisect_left(ms1_peaks, (maxI - 0.005,))
97+
pos1 = bisect_left(ms1_peaks, (maxI + 0.005,))
98+
if (
99+
len(
100+
set(
101+
ii
102+
for _, _, ii in ms1_peaks[pos0:pos1]
103+
if sample_type[ii] != "BLANK"
104+
)
105+
)
106+
>= 5
107+
):
108+
conspec.append(maxI)
109+
del ms1_peaks[pos0:pos1]
110+
lib_ent = RFlib.get_cpds()
111+
con_tab = dict()
112+
con_ms1 = dict()
113+
for ii, conmz in enumerate(sorted(conspec), 1):
114+
con_tab[(conmz, ii)] = dict()
115+
err_bd = RFcommonfn.bound_ppm(conmz * ms1ppm)
116+
pos_0 = bisect_left(lib_ent, (conmz - err_bd,))
117+
pos_1 = bisect_left(lib_ent, (conmz + err_bd,))
118+
ms1name = [
119+
x.replace("\n", "---")
120+
for _, x in sorted(
121+
set((abs(ent.Mmass - conmz), ent.name) for ent in lib_ent[pos_0:pos_1])
122+
)
123+
]
124+
con_ms1[conmz] = ms1name
125+
126+
for avespec, ms2_scans, basename0 in avespec_l:
127+
pos = bisect_left(ms2_scans, (conmz,))
128+
if pos > 0 and ms2_scans[pos][0] - conmz > conmz - ms2_scans[pos - 1][0]:
129+
pos -= 1
130+
expMS2 = ms2_scans[pos]
131+
top10 = sorted(expMS2[1].I_l, reverse=True)[min(len(expMS2[1].I_l) - 1, 9)]
132+
topN = [x for x in zip(expMS2[1].mz_l, expMS2[1].I_l) if x[1] >= top10]
133+
topmz = [x for x, _ in topN]
134+
topI = [x for _, x in topN]
135+
if abs(conmz - expMS2[0]) > 0.51:
136+
print(
137+
"{} {} {} {}".format(
138+
abs(conmz - expMS2[0]), expMS2[0], conmz, basename0
139+
)
140+
)
141+
142+
score_ent = []
143+
for ent in lib_ent[pos_0:pos_1]:
144+
ms2_I = []
145+
ent_I = []
146+
xfrag = set()
147+
hpeak = 0
148+
mpeak = 0
149+
fent = sorted(
150+
[
151+
(y, x)
152+
for x, y in zip(ent.mz, ent.I)
153+
if (ent.charge * conmz - x) > 3.3
154+
],
155+
reverse=True,
156+
)[:10]
157+
for f_I, f_mz in fent:
158+
err_bd = 0.01
159+
pos0 = bisect_left(topmz, f_mz - err_bd)
160+
pos1 = bisect_left(topmz, f_mz + err_bd, lo=pos0)
161+
ent_I.append(f_I)
162+
163+
if pos0 != pos1:
164+
ms2_I.append(max(topI[pos0:pos1]))
165+
for i in range(pos0, pos1):
166+
xfrag.add(i)
167+
if f_I == fent[0][0]:
168+
hpeak = 1
169+
mpeak += 1
170+
else:
171+
ms2_I.append(0)
172+
173+
if hpeak:
174+
for nn, (f_mz, f_I) in enumerate(zip(topmz, topI)):
175+
if nn not in xfrag and (ent.charge * conmz - f_mz) > 3.3:
176+
ms2_I.append(f_I)
177+
ent_I.append(0)
178+
cs = cos_sim(ent_I, ms2_I)
179+
score_ent.append((cs, ent, mpeak))
180+
pos0 = bisect_left(avespec, (conmz - 0.005,))
181+
pos1 = bisect_left(avespec, (conmz + 0.005,))
182+
aveI = sum(x for _, x in avespec[pos0:pos1])
183+
ave_mz = (
184+
sum(mz * i for mz, i in avespec[pos0:pos1]) / aveI if aveI > 0 else None
185+
)
186+
if score_ent:
187+
score_ent = max(score_ent)
188+
sc = score_ent[0]
189+
mpeak = score_ent[2]
190+
ent = score_ent[1]
191+
exp_mz = expMS2[1].mz_l
192+
exp_I = expMS2[1].I_l
193+
else:
194+
sc = None
195+
mpeak = None
196+
ent = Ent(
197+
conmz, "m/z={:.4f}".format(conmz), tuple(), tuple(), "", 1, None, ""
198+
)
199+
exp_mz = exp_I = tuple()
200+
con_tab[(conmz, ii)][basename0] = (
201+
aveI,
202+
sc,
203+
ent,
204+
exp_mz,
205+
exp_I,
206+
ave_mz,
207+
mpeak,
208+
)
209+
210+
for basename0 in basename_l:
211+
open("ann_{}.txt".format(basename0), "w")
212+
213+
frago = open("quant_frag.txt", "w")
214+
frago.write(
215+
"group\tID\talt_IDs\tMS1\tmz\tadduct\tID\tcount\tfrag_m/z\t"
216+
+ "\t".join(basename_l)
217+
+ "\t"
218+
+ "\t".join("score_" + x for x in basename_l)
219+
+ "\t"
220+
+ "\t".join("mass_error_" + x for x in basename_l)
221+
+ "\n"
222+
)
223+
224+
for x, y in sorted(con_tab.items(), key=lambda x: x[0][0]):
225+
c = collections.Counter(yy[2] for yy in y.values())
226+
ent, cn = c.most_common(1)[0]
227+
identified = (
228+
"" if all(yy[2].name.startswith("m/z=") for yy in y.values()) else "*"
229+
)
230+
id_with_count = "{} ({})".format(ent.name.replace("\n", "---"), cn)
231+
alt_id_with_count = " --- ".join(
232+
"{} ({})".format(ent.name.replace("\n", "---"), cn)
233+
for ent, cn in c.most_common()[1:]
234+
)
235+
count_pos = sum(1 for qs in y.values() if qs[0] > 0)
236+
frago.write(
237+
"{}\t{}\t{}\t{}\t{:.4f}\t{}\t{}\t{}\t{}".format(
238+
x[1],
239+
id_with_count,
240+
alt_id_with_count,
241+
" --- ".join(con_ms1[x[0]]),
242+
x[0],
243+
ent.adduct,
244+
identified,
245+
count_pos,
246+
"precursor",
247+
)
248+
)
249+
250+
for basename0 in basename_l:
251+
qs = y[basename0]
252+
frago.write("\t{:.1f}".format(qs[0]))
253+
for basename0 in basename_l:
254+
qs = y[basename0]
255+
frago.write("\t{:.2f}".format(qs[1], qs[6]) if qs and qs[1] else "\t")
256+
for basename0 in basename_l:
257+
qs = y[basename0]
258+
frago.write(
259+
"\t{:.3f}".format(ent.Mmass - qs[5]) if qs[1] and qs[5] else "\t"
260+
)
261+
262+
frago.write("\n")
263+
264+
if ent.mz:
265+
mzML_f = dict()
266+
for basename0 in basename_l:
267+
qs = y[basename0]
268+
mzML_f[basename0] = dict()
269+
for f_mz in ent.mz:
270+
pos0 = bisect_left(qs[3], f_mz - 0.01)
271+
pos1 = bisect_left(qs[3], f_mz + 0.01)
272+
mzML_f[basename0][f_mz] = (
273+
max(qs[4][pos0:pos1]) if pos0 < pos1 else 0
274+
)
275+
maxf = max(mzML_f[basename0].values())
276+
if maxf > 0:
277+
for f_mz in ent.mz:
278+
mzML_f[basename0][f_mz] /= maxf
279+
280+
for f_mz, _ in sorted(
281+
zip(ent.mz, ent.I), key=operator.itemgetter(1), reverse=True
282+
):
283+
frago.write(
284+
"{}\t{}\t{}\t{}\t{:.4f}\t{}\t{}\t{}\t{}".format(
285+
x[1],
286+
id_with_count,
287+
alt_id_with_count,
288+
"",
289+
x[0],
290+
ent.adduct,
291+
identified,
292+
sum(1 for basename0 in basename_l if mzML_f[basename0][f_mz] > 0),
293+
f_mz,
294+
)
295+
)
296+
for basename0 in basename_l:
297+
frago.write("\t{:.2f}".format(mzML_f[basename0][f_mz]))
298+
frago.write("\n")
299+
300+
for basename0 in basename_l:
301+
qs = y[basename0]
302+
if qs[1]:
303+
with open("ann_{}.txt".format(os.path.basename(basename0)), "a") as ann:
304+
ann.write("NAME:\n")
305+
ann.write("{}\n".format(qs[2].name))
306+
ann.write("ADDUCT: {}\n".format(qs[2].adduct))
307+
ann.write("TARGET_M/Z: {:.6f}\n".format(x[0]))
308+
ann.write("DOT_PRODUCT: {:.3f}\n".format(qs[1]))
309+
ann.write("EXPERIMENTAL_SPECTRUM:\n")
310+
for mz, i in sorted(
311+
zip(qs[3], qs[4]), key=operator.itemgetter(1), reverse=True
312+
):
313+
ann.write("{:.6f} {:.2f}\n".format(mz, i))
314+
ann.write("LIBRARY_SPECTRUM:\n")
315+
for mz, i in zip(qs[2].mz, qs[2].I):
316+
ann.write("{:.6f} {:.2f}\n".format(mz, i))
317+
ann.write("\n")
318+
319+
with PdfPages("aveMS1spec.pdf") as pdf0:
320+
for avespec, _, basename0 in avespec_l:
321+
plt.figure(figsize=(9, 4))
322+
ax = plt.subplot(1, 1, 1)
323+
ax.set_title("{} ave. MS1".format(basename0[:-5]))
324+
exp_ = ax.vlines(
325+
x=[mz for mz, _ in avespec],
326+
ymin=0,
327+
ymax=[i for _, i in avespec],
328+
color="black",
329+
lw=0.5,
330+
)
331+
ax.set_xlabel("m/z") # ,fontsize=22)
332+
ax.set_ylabel("intensity") # ,fontsize=22)
333+
pdf0.savefig()
334+
plt.close()
335+
336+
print("Run time = {:.1f} mins".format(((time.time() - start_time) / 60)))

src/RFcommonfn.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from bisect import bisect_left
2+
import operator
3+
import os
4+
import collections
5+
import sys
6+
7+
Ent = collections.namedtuple("Ent", ("Mmass name mz I adduct charge rt formu"))
8+
Spec = collections.namedtuple("Spec", ("rt mz_l I_l"))
9+
10+
11+
def read_param(param_set):
12+
param_dict = dict()
13+
with open("param.txt") as input_param:
14+
key_ = ""
15+
value_ = ""
16+
for line in (l.strip() for l in input_param if l[0] != "#"):
17+
if not line and key_ and value_:
18+
param_dict[key_] = value_[:-1]
19+
key_ = ""
20+
value_ = ""
21+
elif line in param_set:
22+
key_ = line
23+
elif key_:
24+
value_ += line + "\n"
25+
param_dict[key_] = value_[:-1]
26+
return param_dict
27+
28+
29+
def bound_ppm(mz_diff):
30+
return max(0.002, mz_diff)

0 commit comments

Comments
 (0)