@@ -145,6 +145,7 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
145
145
146
146
// add proteins with unique peptides to the parsimony dictionary before applying parsimony algorithm (more efficient)
147
147
Dictionary < Protein , HashSet < CompactPeptide > > parsimonyDict = new Dictionary < Protein , HashSet < CompactPeptide > > ( ) ;
148
+ HashSet < Protein > proteinsWithUniquePeptides = new HashSet < Protein > ( ) ;
148
149
HashSet < CompactPeptide > usedPeptides = new HashSet < CompactPeptide > ( ) ;
149
150
HashSet < string > usedBaseSequences = new HashSet < string > ( ) ;
150
151
@@ -162,6 +163,7 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
162
163
163
164
if ( proteinContainsUniquePeptide )
164
165
{
166
+ proteinsWithUniquePeptides . Add ( kvp . Key ) ;
165
167
parsimonyDict . Add ( kvp . Key , kvp . Value ) ;
166
168
foreach ( var peptide in kvp . Value )
167
169
{
@@ -175,9 +177,7 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
175
177
// greedy algorithm adds the next protein that will account for the most unaccounted-for peptides
176
178
HashSet < CompactPeptide > bestProteinPeptideList = new HashSet < CompactPeptide > ( ) ;
177
179
Protein bestProtein = null ;
178
-
179
- var initialDictCompactPeptides = new HashSet < CompactPeptide > ( compactPeptideToProteinPeptideMatching . Keys . Distinct ( ) . ToList ( ) ) ;
180
- int startingPeptides = initialDictCompactPeptides . Count ;
180
+ int startingPeptides = compactPeptideToProteinPeptideMatching . Keys . Count ;
181
181
bool currentBestPeptidesIsOne = false ;
182
182
183
183
// as long as there are peptides that have not been accounted for, keep going
@@ -299,6 +299,7 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
299
299
proteinListHere . Add ( kvp . Key ) ;
300
300
proteinGroups . Add ( new ProteinGroup ( proteinListHere , kvp . Value , uniquePeptidesHere ) ) ;
301
301
}
302
+
302
303
303
304
// grab indistinguishable proteins
304
305
foreach ( var proteinGroup in proteinGroups )
@@ -307,14 +308,17 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
307
308
{
308
309
foreach ( var kvp in newDict )
309
310
{
310
- if ( ! parsimonyDict . ContainsKey ( kvp . Key ) )
311
+ if ( ! proteinsWithUniquePeptides . Contains ( kvp . Key ) )
311
312
{
312
- if ( kvp . Value . Count == proteinGroup . PeptideList . Count )
313
+ if ( ! parsimonyDict . ContainsKey ( kvp . Key ) )
313
314
{
314
- if ( kvp . Value . SetEquals ( proteinGroup . PeptideList ) )
315
+ if ( kvp . Value . Count == proteinGroup . PeptideList . Count )
315
316
{
316
- proteinGroup . Proteins . Add ( kvp . Key ) ;
317
- parsimonyDict . Add ( kvp . Key , kvp . Value ) ;
317
+ if ( kvp . Value . SetEquals ( proteinGroup . PeptideList ) )
318
+ {
319
+ proteinGroup . Proteins . Add ( kvp . Key ) ;
320
+ parsimonyDict . Add ( kvp . Key , kvp . Value ) ;
321
+ }
318
322
}
319
323
}
320
324
}
@@ -335,13 +339,13 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
335
339
HashSet < PeptideWithSetModifications > newPeptides = new HashSet < PeptideWithSetModifications > ( ) ;
336
340
HashSet < Protein > proteinListHere ;
337
341
338
- // get the peptide 's protein group after parsimony
342
+ // get the CompactPeptide's protein list after parsimony
339
343
peptideProteinListMatch . TryGetValue ( peptide , out proteinListHere ) ;
340
344
341
- // find peptide 's original (unparsimonious) virtual peptide matches
345
+ // find the CompactPeptide's original (unparsimonious) peptide matches
342
346
compactPeptideToProteinPeptideMatching . TryGetValue ( peptide , out oldPeptides ) ;
343
347
344
- // get the virtual peptides that belong to the post-parsimony protein(s) only
348
+ // get the peptides that belong to the post-parsimony protein(s) only
345
349
foreach ( var peptide1 in oldPeptides )
346
350
{
347
351
if ( proteinListHere . Contains ( peptide1 . Protein ) )
@@ -350,7 +354,7 @@ public void ApplyProteinParsimony(out List<ProteinGroup> proteinGroups)
350
354
}
351
355
}
352
356
353
- // make new dictionary using only virtual peptides from parsimonious protein list
357
+ // make new dictionary using only peptides from parsimonious protein list
354
358
answer . Add ( peptide , newPeptides ) ;
355
359
}
356
360
}
@@ -367,6 +371,9 @@ public void ScoreProteinGroups(List<ProteinGroup> proteinGroups, List<NewPsmWith
367
371
368
372
Dictionary < string , List < NewPsmWithFdr > > peptideBaseSeqToPsmMatching = new Dictionary < string , List < NewPsmWithFdr > > ( ) ;
369
373
Dictionary < CompactPeptide , NewPsmWithFdr > peptideToBestPsmMatching = new Dictionary < CompactPeptide , NewPsmWithFdr > ( ) ;
374
+ Dictionary < CompactPeptide , HashSet < ProteinGroup > > peptideToProteinGroupMatching = new Dictionary < CompactPeptide , HashSet < ProteinGroup > > ( ) ;
375
+ HashSet < CompactPeptide > allRazorPeptides = new HashSet < CompactPeptide > ( ) ;
376
+ HashSet < ProteinGroup > proteinGroupsToRemove = new HashSet < ProteinGroup > ( ) ;
370
377
371
378
// match the peptide base sequence to all of its PSMs
372
379
foreach ( var psm in psmList )
@@ -388,6 +395,25 @@ public void ScoreProteinGroups(List<ProteinGroup> proteinGroups, List<NewPsmWith
388
395
}
389
396
}
390
397
398
+ // add every psm that corresponds to the protein group's peptides to the group
399
+ foreach ( var proteinGroup in proteinGroups )
400
+ {
401
+ foreach ( var peptide in proteinGroup . PeptideList )
402
+ {
403
+ string peptideBaseSequence = string . Join ( "" , peptide . BaseSequence . Select ( b => char . ConvertFromUtf32 ( b ) ) ) ;
404
+ List < NewPsmWithFdr > psmListForThisBaseSeq = new List < NewPsmWithFdr > ( ) ;
405
+
406
+ peptideBaseSeqToPsmMatching . TryGetValue ( peptideBaseSequence , out psmListForThisBaseSeq ) ;
407
+ foreach ( var psm in psmListForThisBaseSeq )
408
+ {
409
+ if ( ! proteinGroup . TotalPsmList . Contains ( psm ) )
410
+ {
411
+ proteinGroup . TotalPsmList . Add ( psm ) ;
412
+ }
413
+ }
414
+ }
415
+ }
416
+
391
417
// find the best psm per base sequence
392
418
foreach ( var kvp in peptideBaseSeqToPsmMatching )
393
419
{
@@ -424,12 +450,12 @@ public void ScoreProteinGroups(List<ProteinGroup> proteinGroups, List<NewPsmWith
424
450
thisProteinGroupsPsmList . Add ( psm ) ;
425
451
}
426
452
}
427
- proteinGroup . PsmList = thisProteinGroupsPsmList ;
453
+ proteinGroup . BestPsmList = thisProteinGroupsPsmList ;
428
454
429
455
// remove CompactPeptides that are not associated with the best psm per base sequence from the group
430
456
HashSet < CompactPeptide > newPeptideList = new HashSet < CompactPeptide > ( ) ;
431
457
HashSet < CompactPeptide > newUniquePeptideList = new HashSet < CompactPeptide > ( ) ;
432
- foreach ( var psm in proteinGroup . PsmList )
458
+ foreach ( var psm in proteinGroup . BestPsmList )
433
459
{
434
460
CompactPeptide peptide = psm . thisPSM . newPsm . GetCompactPeptide ( variableModifications , localizeableModifications ) ;
435
461
@@ -440,11 +466,67 @@ public void ScoreProteinGroups(List<ProteinGroup> proteinGroups, List<NewPsmWith
440
466
newUniquePeptideList . Add ( peptide ) ;
441
467
}
442
468
}
469
+
470
+ // for finding razor peptides later
471
+ foreach ( var peptide in proteinGroup . PeptideList )
472
+ {
473
+ HashSet < ProteinGroup > proteinGroupsHere = new HashSet < ProteinGroup > ( ) ;
474
+ if ( peptideToProteinGroupMatching . ContainsKey ( peptide ) )
475
+ {
476
+ peptideToProteinGroupMatching . TryGetValue ( peptide , out proteinGroupsHere ) ;
477
+ proteinGroupsHere . Add ( proteinGroup ) ;
478
+ }
479
+ else
480
+ {
481
+ proteinGroupsHere . Add ( proteinGroup ) ;
482
+ peptideToProteinGroupMatching . Add ( peptide , proteinGroupsHere ) ;
483
+ }
484
+ }
485
+
443
486
proteinGroup . PeptideList = newPeptideList ;
444
487
proteinGroup . UniquePeptideList = newUniquePeptideList ;
445
488
446
489
// score the group (scoring algorithm defined in the ProteinGroup class)
447
- proteinGroup . scoreThisProteinGroup ( ) ;
490
+ proteinGroup . ScoreThisProteinGroup ( ) ;
491
+
492
+ // remove empty protein groups (peptides were too poor quality and group doesn't exist anymore)
493
+ if ( proteinGroup . proteinGroupScore == 0 )
494
+ proteinGroupsToRemove . Add ( proteinGroup ) ;
495
+ }
496
+
497
+ foreach ( var proteinGroup in proteinGroupsToRemove )
498
+ {
499
+ proteinGroups . Remove ( proteinGroup ) ;
500
+ }
501
+
502
+ // build razor peptide list (peptides that have >1 protein groups in the final protein group list)
503
+ foreach ( var kvp in peptideToProteinGroupMatching )
504
+ {
505
+ if ( kvp . Value . Count > 1 )
506
+ allRazorPeptides . Add ( kvp . Key ) ;
507
+ }
508
+
509
+ foreach ( var proteinGroup in proteinGroups )
510
+ {
511
+ foreach ( var peptide in proteinGroup . PeptideList )
512
+ {
513
+ // build razor peptide list for each protein group
514
+ if ( allRazorPeptides . Contains ( peptide ) )
515
+ {
516
+ proteinGroup . RazorPeptideList . Add ( peptide ) ;
517
+ }
518
+
519
+ // build PeptideWithSetMod list to calc sequence coverage
520
+ HashSet < PeptideWithSetModifications > peptidesWithSetMods = null ;
521
+ compactPeptideToProteinPeptideMatching . TryGetValue ( peptide , out peptidesWithSetMods ) ;
522
+ foreach ( var pep in peptidesWithSetMods )
523
+ {
524
+ proteinGroup . PeptideWithSetModsList . Add ( pep ) ;
525
+ }
526
+ }
527
+
528
+ // calculate sequence coverage for each protein in the group
529
+ proteinGroup . CalculateSequenceCoverage ( ) ;
448
530
}
449
531
}
450
532
0 commit comments