|
| 1 | +import sys |
| 2 | +import collections |
| 3 | +import operator |
| 4 | +import itertools |
| 5 | +from bisect import bisect_left |
| 6 | +import os |
| 7 | +import glob |
| 8 | +import concurrent.futures |
| 9 | +import math |
| 10 | +import time |
| 11 | + |
| 12 | +import matplotlib |
| 13 | + |
| 14 | +matplotlib.use("Agg") |
| 15 | +import matplotlib.pyplot as plt |
| 16 | +from matplotlib.backends.backend_pdf import PdfPages |
| 17 | + |
| 18 | +import RFcommonfn |
| 19 | +import RFread |
| 20 | +import RFlib |
| 21 | + |
| 22 | + |
| 23 | +Ent = RFcommonfn.Ent |
| 24 | + |
| 25 | +param_set = { |
| 26 | + "mzML_files", |
| 27 | + "library", |
| 28 | + "ms1_ppm", |
| 29 | + "sample_info", |
| 30 | +} |
| 31 | +param_dict = RFcommonfn.read_param(param_set) |
| 32 | +mzML_files = sorted(glob.glob(param_dict["mzML_files"])) |
| 33 | +basename_l = [os.path.basename(x) for x in mzML_files] |
| 34 | +ms1ppm = float(param_dict["ms1_ppm"]) / 1e6 |
| 35 | + |
| 36 | + |
| 37 | +def get_sample_info(): |
| 38 | + sample_info_file = glob.glob(param_dict["sample_info"])[0] |
| 39 | + sample_type = dict() |
| 40 | + with open(sample_info_file) as sfile: |
| 41 | + next(sfile) |
| 42 | + for line in sfile: |
| 43 | + if line[0] != "#" and line.rstrip(): |
| 44 | + lsp = line.rstrip("\n").split("\t") |
| 45 | + sample_type[lsp[0]] = lsp[1].upper() |
| 46 | + return sample_type |
| 47 | + |
| 48 | + |
| 49 | +def cos_sim(list1, list2): |
| 50 | + if len(list1) != len(list2): |
| 51 | + print("adf") |
| 52 | + sys.exit() |
| 53 | + if sum(list1) <= 0 or sum(list2) <= 0: |
| 54 | + return 0 |
| 55 | + return sum(math.sqrt(x * y) for x, y in zip(list1, list2)) / math.sqrt( |
| 56 | + sum(list1) * sum(list2) |
| 57 | + ) |
| 58 | + |
| 59 | + |
| 60 | +def aveMS1spec(mzML_file): |
| 61 | + basename0 = os.path.basename(mzML_file) |
| 62 | + print(basename0) |
| 63 | + ms1_scans, ms2_scans, _ = RFread.print_eic_ms(mzML_file) |
| 64 | + ms1_peaks = sorted( |
| 65 | + (mz, i, ii) |
| 66 | + for ii, ms1_ in enumerate(ms1_scans) |
| 67 | + for mz, i in zip(ms1_.mz_l, ms1_.I_l) |
| 68 | + ) |
| 69 | + avespec = [] |
| 70 | + while ms1_peaks: |
| 71 | + maxI = max(ms1_peaks, key=operator.itemgetter(1))[0] |
| 72 | + pos0 = bisect_left(ms1_peaks, (maxI - 0.005,)) |
| 73 | + pos1 = bisect_left(ms1_peaks, (maxI + 0.005,)) |
| 74 | + if len(set(ii for _, _, ii in ms1_peaks[pos0:pos1])) > len(ms1_scans) / 3: |
| 75 | + avespec.append( |
| 76 | + (maxI, sum(i for _, i, _ in ms1_peaks[pos0:pos1]) / len(ms1_scans)) |
| 77 | + ) |
| 78 | + del ms1_peaks[pos0:pos1] |
| 79 | + return sorted(avespec), ms2_scans, basename0 |
| 80 | + |
| 81 | + |
| 82 | +from multiprocessing import freeze_support |
| 83 | + |
| 84 | +if __name__ == "__main__": |
| 85 | + start_time = time.time() |
| 86 | + print(len(mzML_files), "mzML files") |
| 87 | + freeze_support() |
| 88 | + sample_type = get_sample_info() |
| 89 | + with concurrent.futures.ProcessPoolExecutor(max_workers=9) as executor: |
| 90 | + avespec_l = list(executor.map(aveMS1spec, mzML_files)) |
| 91 | + |
| 92 | + ms1_peaks = sorted((mz, i, ii) for ms1_, _, ii in avespec_l for mz, i in ms1_) |
| 93 | + conspec = [] |
| 94 | + while ms1_peaks: |
| 95 | + maxI = max(ms1_peaks, key=operator.itemgetter(1))[0] |
| 96 | + pos0 = bisect_left(ms1_peaks, (maxI - 0.005,)) |
| 97 | + pos1 = bisect_left(ms1_peaks, (maxI + 0.005,)) |
| 98 | + if ( |
| 99 | + len( |
| 100 | + set( |
| 101 | + ii |
| 102 | + for _, _, ii in ms1_peaks[pos0:pos1] |
| 103 | + if sample_type[ii] != "BLANK" |
| 104 | + ) |
| 105 | + ) |
| 106 | + >= 5 |
| 107 | + ): |
| 108 | + conspec.append(maxI) |
| 109 | + del ms1_peaks[pos0:pos1] |
| 110 | + lib_ent = RFlib.get_cpds() |
| 111 | + con_tab = dict() |
| 112 | + con_ms1 = dict() |
| 113 | + for ii, conmz in enumerate(sorted(conspec), 1): |
| 114 | + con_tab[(conmz, ii)] = dict() |
| 115 | + err_bd = RFcommonfn.bound_ppm(conmz * ms1ppm) |
| 116 | + pos_0 = bisect_left(lib_ent, (conmz - err_bd,)) |
| 117 | + pos_1 = bisect_left(lib_ent, (conmz + err_bd,)) |
| 118 | + ms1name = [ |
| 119 | + x.replace("\n", "---") |
| 120 | + for _, x in sorted( |
| 121 | + set((abs(ent.Mmass - conmz), ent.name) for ent in lib_ent[pos_0:pos_1]) |
| 122 | + ) |
| 123 | + ] |
| 124 | + con_ms1[conmz] = ms1name |
| 125 | + |
| 126 | + for avespec, ms2_scans, basename0 in avespec_l: |
| 127 | + pos = bisect_left(ms2_scans, (conmz,)) |
| 128 | + if pos > 0 and ms2_scans[pos][0] - conmz > conmz - ms2_scans[pos - 1][0]: |
| 129 | + pos -= 1 |
| 130 | + expMS2 = ms2_scans[pos] |
| 131 | + top10 = sorted(expMS2[1].I_l, reverse=True)[min(len(expMS2[1].I_l) - 1, 9)] |
| 132 | + topN = [x for x in zip(expMS2[1].mz_l, expMS2[1].I_l) if x[1] >= top10] |
| 133 | + topmz = [x for x, _ in topN] |
| 134 | + topI = [x for _, x in topN] |
| 135 | + if abs(conmz - expMS2[0]) > 0.51: |
| 136 | + print( |
| 137 | + "{} {} {} {}".format( |
| 138 | + abs(conmz - expMS2[0]), expMS2[0], conmz, basename0 |
| 139 | + ) |
| 140 | + ) |
| 141 | + |
| 142 | + score_ent = [] |
| 143 | + for ent in lib_ent[pos_0:pos_1]: |
| 144 | + ms2_I = [] |
| 145 | + ent_I = [] |
| 146 | + xfrag = set() |
| 147 | + hpeak = 0 |
| 148 | + mpeak = 0 |
| 149 | + fent = sorted( |
| 150 | + [ |
| 151 | + (y, x) |
| 152 | + for x, y in zip(ent.mz, ent.I) |
| 153 | + if (ent.charge * conmz - x) > 3.3 |
| 154 | + ], |
| 155 | + reverse=True, |
| 156 | + )[:10] |
| 157 | + for f_I, f_mz in fent: |
| 158 | + err_bd = 0.01 |
| 159 | + pos0 = bisect_left(topmz, f_mz - err_bd) |
| 160 | + pos1 = bisect_left(topmz, f_mz + err_bd, lo=pos0) |
| 161 | + ent_I.append(f_I) |
| 162 | + |
| 163 | + if pos0 != pos1: |
| 164 | + ms2_I.append(max(topI[pos0:pos1])) |
| 165 | + for i in range(pos0, pos1): |
| 166 | + xfrag.add(i) |
| 167 | + if f_I == fent[0][0]: |
| 168 | + hpeak = 1 |
| 169 | + mpeak += 1 |
| 170 | + else: |
| 171 | + ms2_I.append(0) |
| 172 | + |
| 173 | + if hpeak: |
| 174 | + for nn, (f_mz, f_I) in enumerate(zip(topmz, topI)): |
| 175 | + if nn not in xfrag and (ent.charge * conmz - f_mz) > 3.3: |
| 176 | + ms2_I.append(f_I) |
| 177 | + ent_I.append(0) |
| 178 | + cs = cos_sim(ent_I, ms2_I) |
| 179 | + score_ent.append((cs, ent, mpeak)) |
| 180 | + pos0 = bisect_left(avespec, (conmz - 0.005,)) |
| 181 | + pos1 = bisect_left(avespec, (conmz + 0.005,)) |
| 182 | + aveI = sum(x for _, x in avespec[pos0:pos1]) |
| 183 | + ave_mz = ( |
| 184 | + sum(mz * i for mz, i in avespec[pos0:pos1]) / aveI if aveI > 0 else None |
| 185 | + ) |
| 186 | + if score_ent: |
| 187 | + score_ent = max(score_ent) |
| 188 | + sc = score_ent[0] |
| 189 | + mpeak = score_ent[2] |
| 190 | + ent = score_ent[1] |
| 191 | + exp_mz = expMS2[1].mz_l |
| 192 | + exp_I = expMS2[1].I_l |
| 193 | + else: |
| 194 | + sc = None |
| 195 | + mpeak = None |
| 196 | + ent = Ent( |
| 197 | + conmz, "m/z={:.4f}".format(conmz), tuple(), tuple(), "", 1, None, "" |
| 198 | + ) |
| 199 | + exp_mz = exp_I = tuple() |
| 200 | + con_tab[(conmz, ii)][basename0] = ( |
| 201 | + aveI, |
| 202 | + sc, |
| 203 | + ent, |
| 204 | + exp_mz, |
| 205 | + exp_I, |
| 206 | + ave_mz, |
| 207 | + mpeak, |
| 208 | + ) |
| 209 | + |
| 210 | + for basename0 in basename_l: |
| 211 | + open("ann_{}.txt".format(basename0), "w") |
| 212 | + |
| 213 | + frago = open("quant_frag.txt", "w") |
| 214 | + frago.write( |
| 215 | + "group\tID\talt_IDs\tMS1\tmz\tadduct\tID\tcount\tfrag_m/z\t" |
| 216 | + + "\t".join(basename_l) |
| 217 | + + "\t" |
| 218 | + + "\t".join("score_" + x for x in basename_l) |
| 219 | + + "\t" |
| 220 | + + "\t".join("mass_error_" + x for x in basename_l) |
| 221 | + + "\n" |
| 222 | + ) |
| 223 | + |
| 224 | + for x, y in sorted(con_tab.items(), key=lambda x: x[0][0]): |
| 225 | + c = collections.Counter(yy[2] for yy in y.values()) |
| 226 | + ent, cn = c.most_common(1)[0] |
| 227 | + identified = ( |
| 228 | + "" if all(yy[2].name.startswith("m/z=") for yy in y.values()) else "*" |
| 229 | + ) |
| 230 | + id_with_count = "{} ({})".format(ent.name.replace("\n", "---"), cn) |
| 231 | + alt_id_with_count = " --- ".join( |
| 232 | + "{} ({})".format(ent.name.replace("\n", "---"), cn) |
| 233 | + for ent, cn in c.most_common()[1:] |
| 234 | + ) |
| 235 | + count_pos = sum(1 for qs in y.values() if qs[0] > 0) |
| 236 | + frago.write( |
| 237 | + "{}\t{}\t{}\t{}\t{:.4f}\t{}\t{}\t{}\t{}".format( |
| 238 | + x[1], |
| 239 | + id_with_count, |
| 240 | + alt_id_with_count, |
| 241 | + " --- ".join(con_ms1[x[0]]), |
| 242 | + x[0], |
| 243 | + ent.adduct, |
| 244 | + identified, |
| 245 | + count_pos, |
| 246 | + "precursor", |
| 247 | + ) |
| 248 | + ) |
| 249 | + |
| 250 | + for basename0 in basename_l: |
| 251 | + qs = y[basename0] |
| 252 | + frago.write("\t{:.1f}".format(qs[0])) |
| 253 | + for basename0 in basename_l: |
| 254 | + qs = y[basename0] |
| 255 | + frago.write("\t{:.2f}".format(qs[1], qs[6]) if qs and qs[1] else "\t") |
| 256 | + for basename0 in basename_l: |
| 257 | + qs = y[basename0] |
| 258 | + frago.write( |
| 259 | + "\t{:.3f}".format(ent.Mmass - qs[5]) if qs[1] and qs[5] else "\t" |
| 260 | + ) |
| 261 | + |
| 262 | + frago.write("\n") |
| 263 | + |
| 264 | + if ent.mz: |
| 265 | + mzML_f = dict() |
| 266 | + for basename0 in basename_l: |
| 267 | + qs = y[basename0] |
| 268 | + mzML_f[basename0] = dict() |
| 269 | + for f_mz in ent.mz: |
| 270 | + pos0 = bisect_left(qs[3], f_mz - 0.01) |
| 271 | + pos1 = bisect_left(qs[3], f_mz + 0.01) |
| 272 | + mzML_f[basename0][f_mz] = ( |
| 273 | + max(qs[4][pos0:pos1]) if pos0 < pos1 else 0 |
| 274 | + ) |
| 275 | + maxf = max(mzML_f[basename0].values()) |
| 276 | + if maxf > 0: |
| 277 | + for f_mz in ent.mz: |
| 278 | + mzML_f[basename0][f_mz] /= maxf |
| 279 | + |
| 280 | + for f_mz, _ in sorted( |
| 281 | + zip(ent.mz, ent.I), key=operator.itemgetter(1), reverse=True |
| 282 | + ): |
| 283 | + frago.write( |
| 284 | + "{}\t{}\t{}\t{}\t{:.4f}\t{}\t{}\t{}\t{}".format( |
| 285 | + x[1], |
| 286 | + id_with_count, |
| 287 | + alt_id_with_count, |
| 288 | + "", |
| 289 | + x[0], |
| 290 | + ent.adduct, |
| 291 | + identified, |
| 292 | + sum(1 for basename0 in basename_l if mzML_f[basename0][f_mz] > 0), |
| 293 | + f_mz, |
| 294 | + ) |
| 295 | + ) |
| 296 | + for basename0 in basename_l: |
| 297 | + frago.write("\t{:.2f}".format(mzML_f[basename0][f_mz])) |
| 298 | + frago.write("\n") |
| 299 | + |
| 300 | + for basename0 in basename_l: |
| 301 | + qs = y[basename0] |
| 302 | + if qs[1]: |
| 303 | + with open("ann_{}.txt".format(os.path.basename(basename0)), "a") as ann: |
| 304 | + ann.write("NAME:\n") |
| 305 | + ann.write("{}\n".format(qs[2].name)) |
| 306 | + ann.write("ADDUCT: {}\n".format(qs[2].adduct)) |
| 307 | + ann.write("TARGET_M/Z: {:.6f}\n".format(x[0])) |
| 308 | + ann.write("DOT_PRODUCT: {:.3f}\n".format(qs[1])) |
| 309 | + ann.write("EXPERIMENTAL_SPECTRUM:\n") |
| 310 | + for mz, i in sorted( |
| 311 | + zip(qs[3], qs[4]), key=operator.itemgetter(1), reverse=True |
| 312 | + ): |
| 313 | + ann.write("{:.6f} {:.2f}\n".format(mz, i)) |
| 314 | + ann.write("LIBRARY_SPECTRUM:\n") |
| 315 | + for mz, i in zip(qs[2].mz, qs[2].I): |
| 316 | + ann.write("{:.6f} {:.2f}\n".format(mz, i)) |
| 317 | + ann.write("\n") |
| 318 | + |
| 319 | + with PdfPages("aveMS1spec.pdf") as pdf0: |
| 320 | + for avespec, _, basename0 in avespec_l: |
| 321 | + plt.figure(figsize=(9, 4)) |
| 322 | + ax = plt.subplot(1, 1, 1) |
| 323 | + ax.set_title("{} ave. MS1".format(basename0[:-5])) |
| 324 | + exp_ = ax.vlines( |
| 325 | + x=[mz for mz, _ in avespec], |
| 326 | + ymin=0, |
| 327 | + ymax=[i for _, i in avespec], |
| 328 | + color="black", |
| 329 | + lw=0.5, |
| 330 | + ) |
| 331 | + ax.set_xlabel("m/z") # ,fontsize=22) |
| 332 | + ax.set_ylabel("intensity") # ,fontsize=22) |
| 333 | + pdf0.savefig() |
| 334 | + plt.close() |
| 335 | + |
| 336 | + print("Run time = {:.1f} mins".format(((time.time() - start_time) / 60))) |
0 commit comments