Skip to content

Commit

Permalink
Merge pull request rusteomics#51 from aukeheerdink/Cascadia-support
Browse files Browse the repository at this point in the history
Implemented Cascadia support
  • Loading branch information
douweschulte authored Dec 19, 2024
2 parents d32c547 + 5d0a9cb commit f0f98e5
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 4 deletions.
2 changes: 2 additions & 0 deletions clippy.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ doc-valid-idents = [
"NovoB",
"pi-PrimeNovo",
"π-PrimeNovo",
"Cascadia",
"SpectrumSequenceList",
]
avoid-breaking-exported-api = false
check-private-items = true
6 changes: 5 additions & 1 deletion rustyms/src/identification/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use super::{
DeepNovoFamilyData, FastaData, IdentifiedPeptide, IdentifiedPeptideIter,
IdentifiedPeptideSource, InstaNovoData, MSFraggerData, MZTabData, MaxQuantData, NovoBData,
NovorData, OpairData, PLGSData, PLinkData, PeaksData, PepNetData, PowerNovoData, SageData,
SpectrumSequenceListData,
};

// TODO:
Expand Down Expand Up @@ -115,10 +116,13 @@ pub fn open_identified_peptides_file<'a>(
}),
Some("deepnovo_denovo") => {
DeepNovoFamilyData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box)
},
Some("ssl") => {
SpectrumSequenceListData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box)
}
_ => Err(CustomError::error(
"Unknown extension",
"Use CSV, TSV, TXT, PSMTSV, deepnovo_denovo, or Fasta, or any of these as a gzipped file (eg csv.gz).",
"Use CSV, SSL, TSV, TXT, PSMTSV, deepnovo_denovo, or Fasta, or any of these as a gzipped file (eg csv.gz).",
Context::show(path.to_string_lossy()),
)),
}
Expand Down
21 changes: 19 additions & 2 deletions rustyms/src/identification/identified_peptide.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::{
deepnovofamily::DeepNovoFamilyData, fasta::FastaData, instanovo::InstaNovoData,
novob::NovoBData, novor::NovorData, opair::OpairData, peaks::PeaksData, pepnet::PepNetData,
plink::PLinkData, powernovo::PowerNovoData, system::MassOverCharge, MSFraggerData,
MZTabData, MaxQuantData, PLGSData, SageData,
MZTabData, MaxQuantData, PLGSData, SageData, SpectrumSequenceListData,
},
ontologies::CustomDatabase,
peptide::{SemiAmbiguous, SimpleLinear},
Expand Down Expand Up @@ -66,6 +66,8 @@ pub enum MetaData {
PowerNovo(PowerNovoData),
/// Sage metadata
Sage(SageData),
/// SpectrumSequenceList metadata
SpectrumSequenceList(SpectrumSequenceListData),
}

/// A peptide as stored in a identified peptide file, either a simple linear one or a cross-linked peptidoform
Expand Down Expand Up @@ -182,6 +184,7 @@ impl IdentifiedPeptide {
}
}
MetaData::MSFragger(MSFraggerData { peptide, .. })
| MetaData::SpectrumSequenceList(SpectrumSequenceListData { peptide, .. })
| MetaData::MaxQuant(MaxQuantData { peptide, .. })
| MetaData::MZTab(MZTabData { peptide, .. })
| MetaData::DeepNovoFamily(DeepNovoFamilyData { peptide, .. }) => {
Expand Down Expand Up @@ -214,6 +217,7 @@ impl IdentifiedPeptide {
/// Get the name of the format
pub const fn format_name(&self) -> &'static str {
match &self.metadata {
MetaData::SpectrumSequenceList(_) => "SpectrumSequenceList",
MetaData::DeepNovoFamily(_) => "DeepNovo Family",
MetaData::Fasta(_) => "Fasta",
MetaData::InstaNovo(_) => "InstaNovo",
Expand All @@ -235,6 +239,9 @@ impl IdentifiedPeptide {
/// Get the format version detected
pub fn format_version(&self) -> String {
match &self.metadata {
MetaData::SpectrumSequenceList(SpectrumSequenceListData { version, .. }) => {
version.to_string()
}
MetaData::DeepNovoFamily(DeepNovoFamilyData { version, .. }) => version.to_string(),
MetaData::Fasta(_) => "Fasta".to_string(),
MetaData::InstaNovo(InstaNovoData { version, .. }) => version.to_string(),
Expand Down Expand Up @@ -271,6 +278,7 @@ impl IdentifiedPeptide {
MetaData::Novor(NovorData { id, scan, .. }) => id.unwrap_or(*scan).to_string(),
MetaData::Opair(OpairData { scan, .. })
| MetaData::NovoB(NovoBData { scan, .. })
| MetaData::SpectrumSequenceList(SpectrumSequenceListData { scan, .. })
| MetaData::InstaNovo(InstaNovoData { scan, .. }) => scan.to_string(),
MetaData::Sage(SageData { id, .. }) | MetaData::MZTab(MZTabData { id, .. }) => {
id.to_string()
Expand Down Expand Up @@ -336,6 +344,9 @@ impl IdentifiedPeptide {
| MetaData::MZTab(MZTabData { z, .. }) => Some(*z),
MetaData::Peaks(PeaksData { z, .. })
| MetaData::DeepNovoFamily(DeepNovoFamilyData { z, .. }) => *z,
MetaData::SpectrumSequenceList(SpectrumSequenceListData { z, .. }) => {
(z.value >= 0).then_some(Charge::new::<crate::system::charge::e>(z.value as usize))
}
MetaData::Fasta(_) | MetaData::PowerNovo(_) | MetaData::PepNet(_) => None,
}
}
Expand All @@ -361,6 +372,7 @@ impl IdentifiedPeptide {
| MetaData::MSFragger(MSFraggerData { rt, .. }) => Some(*rt),
MetaData::MaxQuant(MaxQuantData { rt, .. })
| MetaData::Novor(NovorData { rt, .. })
| MetaData::SpectrumSequenceList(SpectrumSequenceListData { rt, .. })
| MetaData::MZTab(MZTabData { rt, .. }) => *rt,
MetaData::DeepNovoFamily(_)
| MetaData::InstaNovo(_)
Expand Down Expand Up @@ -409,6 +421,7 @@ impl IdentifiedPeptide {
),

MetaData::Opair(OpairData { raw_file, scan, .. })
| MetaData::SpectrumSequenceList(SpectrumSequenceListData { raw_file, scan, .. })
| MetaData::InstaNovo(InstaNovoData { raw_file, scan, .. }) => {
SpectrumIds::FileKnown(vec![(raw_file.clone(), vec![SpectrumId::Index(*scan)])])
}
Expand Down Expand Up @@ -483,6 +496,7 @@ impl IdentifiedPeptide {
}
MetaData::DeepNovoFamily(_)
| MetaData::Fasta(_)
| MetaData::SpectrumSequenceList(_)
| MetaData::PowerNovo(_)
| MetaData::PepNet(_) => None,
}
Expand Down Expand Up @@ -510,7 +524,10 @@ impl IdentifiedPeptide {
MetaData::DeepNovoFamily(DeepNovoFamilyData { mz, z, .. }) => {
mz.and_then(|mz| z.map(|z| (mz, z)).map(|(mz, z)| mz * z.to_float()))
}
MetaData::Fasta(_) | MetaData::PowerNovo(_) | MetaData::PepNet(_) => None,
MetaData::Fasta(_)
| MetaData::PowerNovo(_)
| MetaData::SpectrumSequenceList(_)
| MetaData::PepNet(_) => None,
}
}

Expand Down
4 changes: 4 additions & 0 deletions rustyms/src/identification/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod plgs;
mod plink;
mod powernovo;
mod sage;
mod ssl;

use crate::*;
pub use deepnovofamily::*;
Expand All @@ -39,6 +40,7 @@ pub use plgs::*;
pub use plink::*;
pub use powernovo::*;
pub use sage::*;
pub use ssl::*;

#[cfg(test)]
mod deepnovofamily_tests;
Expand Down Expand Up @@ -68,3 +70,5 @@ mod plink_tests;
mod powernovo_tests;
#[cfg(test)]
mod sage_tests;
#[cfg(test)]
mod ssl_tests;
2 changes: 1 addition & 1 deletion rustyms/src/identification/opair.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ impl std::fmt::Display for OpairVersion {
f,
"{}",
match self {
Self::Opair => "Opair",
Self::Opair => "",
}
)
}
Expand Down
110 changes: 110 additions & 0 deletions rustyms/src/identification/ssl.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
use crate::{
error::CustomError,
identification::{
common_parser::{OptionalColumn, OptionalLocation},
IdentifiedPeptide, IdentifiedPeptideSource, MetaData,
},
ontologies::CustomDatabase,
system::{isize::Charge, MassOverCharge, Time},
LinearPeptide, SemiAmbiguous,
};

use std::path::{Path, PathBuf};

use serde::{Deserialize, Serialize};

use super::{
common_parser::Location,
csv::{parse_csv, CsvLine},
BoxedIdentifiedPeptideIter,
};

/// Error texts shared by every numeric column parser in this file: a short title followed
/// by a longer explanation. The pair is handed to `Location::parse` as its base error.
static NUMBER_ERROR: (&str, &str) = {
    let title = "Invalid SpectrumSequenceList line";
    let explanation =
        "This column is not a number but it is required to be a number in this format";
    (title, explanation)
};

// Declares the SSL (spectrum sequence list) format family: the column-name format struct,
// the per-line data struct, and the version enum they are tied to. Lines are split on tabs
// and the only known format definition is `SSL`.
//
// Every column gets a closure that turns the raw cell (`Location`) into the typed value.
// `file`, `scan` and `charge` are required; every other column is optional.
format_family!(
    /// The format for any SSL file
    SpectrumSequenceListFormat,
    /// The data from any SSL file
    SpectrumSequenceListData,
    SpectrumSequenceListVersion, [&SSL], b'\t', None;
    required {
        raw_file: PathBuf, |location: Location, _| Ok(Path::new(&location.get_string()).to_owned());
        scan: usize, |location: Location, _| location.parse(NUMBER_ERROR);
        // Some writers emit the charge as a float (`4.0`), so a trailing `.0` is removed
        // before parsing the value as a signed integer.
        z: Charge, |location: Location, _| location
            .trim_end_matches(".0")
            .parse::<isize>(NUMBER_ERROR)
            .map(Charge::new::<crate::system::e>);
    }
    optional {
        start_time: Time, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(Time::new::<crate::system::time::min>);
        end_time: Time, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(Time::new::<crate::system::time::min>);
        // A sequence can be valid ProForma and still not be representable as a semi
        // ambiguous peptide. That is a property of the input file, so it is reported as a
        // parse error instead of panicking inside the library.
        peptide: LinearPeptide<SemiAmbiguous>, |location: Location, custom_database: Option<&CustomDatabase>| {
            let sequence = location.as_str();
            LinearPeptide::pro_forma(sequence, custom_database).and_then(|parsed| {
                parsed.into_semi_ambiguous().ok_or_else(|| {
                    CustomError::error(
                        "Invalid SpectrumSequenceList line",
                        "The sequence is valid ProForma but it is not a semi ambiguous peptide, which is required in this format",
                        crate::error::Context::show(sequence),
                    )
                })
            })
        };
        score: f64, |location: Location, _| location.parse::<f64>(NUMBER_ERROR);
        score_type: String, |location: Location, _| Ok(location.get_string());
        rt: Time, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(Time::new::<crate::system::time::min>);
        adduct: String, |location: Location, _| Ok(location.get_string());
        precursormz: MassOverCharge, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(MassOverCharge::new::<crate::system::mz>);
        moleculename: String, |location: Location, _| Ok(location.get_string());
        inchikey: String, |location: Location, _| Ok(location.get_string());
        otherkeys: String, |location: Location, _| Ok(location.or_empty().get_string());
        ion_mobility: f64, |location: Location, _| location.parse::<f64>(NUMBER_ERROR);
        ion_mobility_units: String, |location: Location, _| Ok(location.get_string());
        ccs: f64, |location: Location, _| location.parse::<f64>(NUMBER_ERROR);
    }
);

/// Wraps a parsed SSL line into the general identified peptide type.
impl From<SpectrumSequenceListData> for IdentifiedPeptide {
    /// Copies the line's score to the top level and stores the full line as metadata.
    /// No local confidence is set for this format.
    fn from(value: SpectrumSequenceListData) -> Self {
        // Read the score first, because `value` is moved into the metadata below.
        let score = value.score;
        let metadata = MetaData::SpectrumSequenceList(value);
        Self {
            score,
            local_confidence: None,
            metadata,
        }
    }
}

/// General type of SSL files
///
/// Maps every field of the format onto the column header used for it in the file.
pub const SSL: SpectrumSequenceListFormat = SpectrumSequenceListFormat {
version: SpectrumSequenceListVersion::SSL,
// Columns declared as required in the format definition.
raw_file: "file",
scan: "scan",
z: "charge",
// All remaining columns are marked optional.
start_time: OptionalColumn::Optional("start-time"),
end_time: OptionalColumn::Optional("end-time"),
peptide: OptionalColumn::Optional("sequence"),
score: OptionalColumn::Optional("score"),
score_type: OptionalColumn::Optional("score-type"),
rt: OptionalColumn::Optional("retention-time"),
adduct: OptionalColumn::Optional("adduct"),
// NOTE(review): this is the only header written with capitals; confirm how header
// matching treats case.
precursormz: OptionalColumn::Optional("precursorMZ"),
moleculename: OptionalColumn::Optional("moleculename"),
inchikey: OptionalColumn::Optional("inchikey"),
otherkeys: OptionalColumn::Optional("otherkeys"),
ion_mobility: OptionalColumn::Optional("ion-mobility"),
ion_mobility_units: OptionalColumn::Optional("ion-mobility-units"),
ccs: OptionalColumn::Optional("ccs"),
};

/// All possible SpectrumSequenceList versions
///
/// Only a single version exists, which is also the default.
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default, Serialize, Deserialize)]
// NOTE(review): presumably kept for consistency with the version enums of other formats;
// confirm whether this allow is still needed for the single `SSL` variant.
#[allow(non_camel_case_types)]
pub enum SpectrumSequenceListVersion {
#[default]
/// SSL file format
SSL,
}

/// Textual form of the version; the single `SSL` version is written as an empty string.
impl std::fmt::Display for SpectrumSequenceListVersion {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
        let label = match self {
            Self::SSL => "",
        };
        f.write_str(label)
    }
}
99 changes: 99 additions & 0 deletions rustyms/src/identification/ssl_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#![allow(clippy::missing_panics_doc)]
use std::io::BufReader;

use crate::identification::{test_format, SpectrumSequenceListData, SpectrumSequenceListVersion};

/// Runs `test_format` on the embedded `CASCADIA_V0_0_5` fixture and expects it to report 20.
/// On failure the error is printed before the test panics.
#[test]
fn cascadia_v0_0_5() {
    let reader = BufReader::new(CASCADIA_V0_0_5.as_bytes());
    let outcome = test_format::<SpectrumSequenceListData>(
        reader,
        None,
        false,
        false,
        Some(SpectrumSequenceListVersion::SSL),
    );
    let count = outcome.unwrap_or_else(|error| {
        println!("{error}");
        panic!("Failed identified peptides test");
    });
    assert_eq!(count, 20);
}

/// Runs `test_format` on the embedded `SMALL_MOLECULES_EXAMPLE` fixture and expects it to
/// report 6. On failure the error is printed before the test panics.
#[test]
fn small_molecule_example() {
    let reader = BufReader::new(SMALL_MOLECULES_EXAMPLE.as_bytes());
    let outcome = test_format::<SpectrumSequenceListData>(
        reader,
        None,
        false,
        false,
        Some(SpectrumSequenceListVersion::SSL),
    );
    let count = outcome.unwrap_or_else(|error| {
        println!("{error}");
        panic!("Failed identified peptides test");
    });
    assert_eq!(count, 6);
}

/// Runs `test_format` on the embedded `PEPTIDE_EXAMPLE` fixture and expects it to report 12.
/// Unlike the other tests in this file the first boolean flag is `true` here.
/// On failure the error is printed before the test panics.
#[test]
fn peptide_example() {
    let reader = BufReader::new(PEPTIDE_EXAMPLE.as_bytes());
    let outcome = test_format::<SpectrumSequenceListData>(
        reader,
        None,
        true,
        false,
        Some(SpectrumSequenceListVersion::SSL),
    );
    let count = outcome.unwrap_or_else(|error| {
        println!("{error}");
        panic!("Failed identified peptides test");
    });
    assert_eq!(count, 12);
}

/// Fixture for the `cascadia_v0_0_5` test: one header line followed by 20 data lines with
/// the columns file, scan, charge, sequence, score-type, score, retention-time, start-time,
/// and end-time. The charge is written as a float (for example `4.0`).
// NOTE(review): the SSL format definition splits on tab characters; confirm the column
// separators in this literal are tabs, as the `Cascadia Score` value itself contains a space.
const CASCADIA_V0_0_5: &str = "file scan charge sequence score-type score retention-time start-time end-time
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 140 4.0 VNHKPSNTKVDKK Cascadia Score 0.8716757 13.830939 13.700380273173717 13.961498312641712
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 143 4.0 LANVNHKPSNTKVDK Cascadia Score 0.9688815 13.73074 13.600180573771862 13.861298613239857
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 284 4.0 LANVNHKPSNTKVDK Cascadia Score 0.91546655 14.112868 13.982309289286999 14.243427328754994
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 350 4.0 TYLANVNHKPSNTK Cascadia Score 0.9532166 14.182865 14.052306123088268 14.313424162556263
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 374 4.0 KENAGEDPGLARQAPKPR Cascadia Score 0.9884338 14.223199 14.092639870952038 14.353757910420033
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 428 4.0 LANVNHKPSNTKVDK Cascadia Score 0.8682205 14.452167 14.321607537578014 14.58272557704601
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 559 3.0 LTPPSREEMTK Cascadia Score 0.94662637 14.79058 14.660020776103405 14.9211388155714
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 624 2.0 KDVDQYMTK Cascadia Score 0.9709009 14.849122 14.718563027690319 14.979681067158314
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 832 3.0 LTPPSREEMTK Cascadia Score 0.98843235 15.338381 15.207821793864635 15.46893983333263
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 939 2.0 YVDGVEVHNAK Cascadia Score 0.8673074 15.534778 15.40421862156239 15.665336661030384
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1003 3.0 KAVGGLGKLGKDA Cascadia Score 0.91004914 15.7533655 15.6228064969286 15.883924536396595
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1064 4.0 HKVYASEVTHQGLSSPVTK Cascadia Score 0.98483497 15.964101 15.833541817973522 16.094659857441517
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1189 3.0 QAPGKGLESVAR Cascadia Score 0.91872495 16.191717 16.06115812809315 16.322276167561146
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1195 3.0 LTPPSREEMTK Cascadia Score 0.9785851 16.206034 16.07547468693104 16.336592726399036
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1212 2.0 KVLPVPQK Cascadia Score 0.9976683 16.322432 16.191872544597057 16.452990584065052
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1219 3.0 EVTHQGLSSPVTK Cascadia Score 0.99756336 16.327246 16.196686692546276 16.45780473201427
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1320 2.0 LTVDGVSR Cascadia Score 0.97489554 16.50676 16.37620062382069 16.637318663288685
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1339 3.0 LTPPSREEMTK Cascadia Score 0.9637745 16.652891 16.52233213932362 16.783450178791615
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 26981 4.0 THTC[Carbamidomethyl]PPC[Carbamidomethyl]PAPELLGGPSVFLFPPKPK Cascadia Score 0.85148484 75.06698 74.93641943485585 75.19753747432384
../test_data/test/20230408_F1_UM4_Peng0013_SA_EXT00_her_01_tryp.mzML 1397 4.0 HKVYAGEVTHQGLSSPVTK Cascadia Score 0.99907994 16.68329 16.552731461833385 16.81384950130138";

/// Fixture for the `small_molecule_example` test: one header line followed by 6 data lines
/// that have no `sequence` column. All charges are negative and only the last line has a
/// value after the molecule name. A raw string is used because that value is wrapped in
/// double quotes.
// NOTE(review): the SSL format definition splits on tab characters; confirm the column
// separators in this literal are tabs.
const SMALL_MOLECULES_EXAMPLE: &str = r#"file scan charge adduct inchikey chemicalformula moleculename otherkeys
dexcaf_051017.mzML 01369 -1 [M-H] ZXPLRDFHBYIQOX-BTBVOZEKSA-N C24H44O21N0 Glc04Reduced
dexcaf_051017.mzML 01639 -1 [M-H] NBVGBCYERZIRIP-JAMOUWTMSA-N C30H54O26N0 Glc05Reduced
dexcaf_051017.mzML 01855 -1 [M-H] PNHJKLJIDNHXFR-ZGJYWSOBSA-N C36H64O31N0 Glc06Reduced
dexcaf_051017.mzML 02029 -1 [M-H] NVKJDLBVRSXYRE-BMFDHOHESA-N C42H74O36N0 Glc07Reduced
dexcaf_051017.mzML 02179 -1 [M-H] YMRGEPQWJZHXFF-MGQBKJSVSA-N C48H84O41N0 Glc08Reduced
dexcaf_051017.mzML 01079 -1 [M-H] RYYVLZVUVIJVGH-UHFFFAOYSA-N C8H10N4O2 Caffeine "InChI:1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)8(14)11(6)2/h4H,1-3H3 HMDB:01847 CAS:58-08-2 SMILES:Cn1cnc2n(C)c(=O)n(C)c(=O)c12""#;

/// Fixture for the `peptide_example` test: one header line followed by 12 data lines that
/// only use the columns file, scan, charge, and sequence. Some sequences carry mass
/// modifications such as `[+57.0]`, and some scan numbers have leading zeros.
// NOTE(review): the SSL format definition splits on tab characters; confirm the column
// separators in this literal are tabs.
const PEPTIDE_EXAMPLE: &str = "file scan charge sequence
demo.ms2 8 3 VGAGAPVYLAAVLEYLAAEVLELAGNAAR
demo.ms2 1806 2 LAESITIEQGK
demo.ms2 2572 2 ELAEDGC[+57.0]SGVEVR
demo.ms2 3088 2 TTAGAVEATSEITEGK
demo.ms2 3266 2 DC[+57.0]EEVGADSNEGGEEEGEEC[+57.0]
demo.ms2 9734 3 IWELEFPEEAADFQQQPVNAQ[-17.0]PQN
demo.ms2 20919 3 VHINIVVIGHVDSGK
../elsewhere/spec.mzXML 00497 2 LKEPAQNTADNAK
../elsewhere/spec.mzXML 00680 2 ALEGPGPGEDAAHSENNPPR
../elsewhere/spec.mzXML 00965 2 FFSHEAEQK
../elsewhere/spec.mzXML 01114 2 C[+57.0]GPSQPLK
../elsewhere/spec.mzXML 01382 2 AVHVQVTDAEAGK";

0 comments on commit f0f98e5

Please sign in to comment.