Skip to content

Commit

Permalink
Merge pull request rusteomics#47 from aukeheerdink:NovoB
Browse files Browse the repository at this point in the history
Implemented NovoB support
  • Loading branch information
douweschulte authored Nov 21, 2024
2 parents 7a162bb + f510992 commit 2520d49
Show file tree
Hide file tree
Showing 8 changed files with 304 additions and 35 deletions.
1 change: 1 addition & 0 deletions clippy.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ doc-valid-idents = [
"PointNovoFamily",
"PowerNovo",
"PepNet",
"NovoB",
"pi-PrimeNovo",
"π-PrimeNovo",
]
Expand Down
4 changes: 2 additions & 2 deletions rustyms/src/identification/deepnovofamily.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ format_family!(
location.location.clone(),
custom_database,
PARAMETERS_LOCK.get_or_init(|| SloppyParsingParameters{
mod_indications: vec![
mod_indications: (Some("mod"), vec![
(AminoAcid::Asparagine, Ontology::Unimod.find_id(7, None).unwrap()),
(AminoAcid::Glutamine, Ontology::Unimod.find_id(7, None).unwrap()),
(AminoAcid::Cysteine, Ontology::Unimod.find_id(6, None).unwrap()),
(AminoAcid::Methionine, Ontology::Unimod.find_id(35, None).unwrap()),
],
]),
..Default::default()
})
)).transpose();
Expand Down
20 changes: 17 additions & 3 deletions rustyms/src/identification/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use super::{
error::{Context, CustomError},
ontologies::CustomDatabase,
DeepNovoFamilyData, FastaData, IdentifiedPeptide, IdentifiedPeptideIter,
IdentifiedPeptideSource, InstaNovoData, MSFraggerData, MZTabData, MaxQuantData, NovorData,
OpairData, PLinkData, PeaksData, PepNetData, PowerNovoData, SageData,
IdentifiedPeptideSource, InstaNovoData, MSFraggerData, MZTabData, MaxQuantData, NovoBData,
NovorData, OpairData, PLinkData, PeaksData, PepNetData, PowerNovoData, SageData,
};

// TODO:
Expand Down Expand Up @@ -89,7 +89,21 @@ pub fn open_identified_peptides_file<'a>(
as Box<dyn Iterator<Item = Result<IdentifiedPeptide, CustomError>> + 'a>
}),
Some("txt") => {
MaxQuantData::parse_file(path, custom_database).map(IdentifiedPeptideIter::into_box)
MaxQuantData::parse_file(path, custom_database)
.map(IdentifiedPeptideIter::into_box)
.or_else(|me| {
NovoBData::parse_file(path, custom_database)
.map(IdentifiedPeptideIter::into_box)
.map_err(|ne| (me, ne))
})
.map_err(|(me, ne)| {
CustomError::error(
"Unknown file format",
"Could not be recognised as either a MaxQuant or NovoB file",
Context::show(path.to_string_lossy()),
)
.with_underlying_errors(vec![me, ne])
})
}
Some("mztab") => MZTabData::parse_file(path, custom_database).map(|peptides| {
Box::new(peptides.into_iter().map(|p| p.map(Into::into)))
Expand Down
56 changes: 42 additions & 14 deletions rustyms/src/identification/identified_peptide.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ use crate::{
formula::MultiChemical,
identification::{
deepnovofamily::DeepNovoFamilyData, fasta::FastaData, instanovo::InstaNovoData,
novor::NovorData, opair::OpairData, peaks::PeaksData, pepnet::PepNetData, plink::PLinkData,
powernovo::PowerNovoData, system::MassOverCharge, MSFraggerData, MZTabData, MaxQuantData,
SageData,
novob::NovoBData, novor::NovorData, opair::OpairData, peaks::PeaksData, pepnet::PepNetData,
plink::PLinkData, powernovo::PowerNovoData, system::MassOverCharge, MSFraggerData,
MZTabData, MaxQuantData, SageData,
},
ontologies::CustomDatabase,
peptide::SemiAmbiguous,
Expand Down Expand Up @@ -48,6 +48,8 @@ pub enum MetaData {
MSFragger(MSFraggerData),
/// mzTab metadata
MZTab(MZTabData),
/// NovoB metadata
NovoB(NovoBData),
/// Novor metadata
Novor(NovorData),
/// OPair metadata
Expand Down Expand Up @@ -174,6 +176,19 @@ impl IdentifiedPeptide {
MetaData::PLink(PLinkData { peptidoform, .. }) => {
Some(ReturnedPeptide::Peptidoform(peptidoform))
}
MetaData::NovoB(NovoBData {
score_forward,
score_reverse,
peptide_forward,
peptide_reverse,
..
}) => {
if score_forward >= score_reverse {
Some(ReturnedPeptide::Linear(peptide_forward))
} else {
Some(ReturnedPeptide::Linear(peptide_reverse))
}
}
}
}

Expand All @@ -188,6 +203,7 @@ impl IdentifiedPeptide {
MetaData::MZTab(_) => "mzTab",
MetaData::Novor(_) => "Novor",
MetaData::Opair(_) => "OPair",
MetaData::NovoB(_) => "NovoB",
MetaData::Peaks(_) => "PEAKS",
MetaData::PepNet(_) => "PepNet",
MetaData::PLink(_) => "pLink",
Expand All @@ -212,6 +228,7 @@ impl IdentifiedPeptide {
MetaData::PowerNovo(PowerNovoData { version, .. }) => version.to_string(),
MetaData::Sage(SageData { version, .. }) => version.to_string(),
MetaData::PepNet(PepNetData { version, .. }) => version.to_string(),
MetaData::NovoB(NovoBData { version, .. }) => version.to_string(),
}
}

Expand All @@ -232,6 +249,7 @@ impl IdentifiedPeptide {
MetaData::DeepNovoFamily(DeepNovoFamilyData { scan, .. }) => scan.iter().join(";"),
MetaData::Novor(NovorData { id, scan, .. }) => id.unwrap_or(*scan).to_string(),
MetaData::Opair(OpairData { scan, .. })
| MetaData::NovoB(NovoBData { scan, .. })
| MetaData::InstaNovo(InstaNovoData { scan, .. }) => scan.to_string(),
MetaData::Sage(SageData { id, .. }) | MetaData::MZTab(MZTabData { id, .. }) => {
id.to_string()
Expand Down Expand Up @@ -286,6 +304,7 @@ impl IdentifiedPeptide {
| MetaData::Sage(SageData { z, .. })
| MetaData::MSFragger(MSFraggerData { z, .. })
| MetaData::MaxQuant(MaxQuantData { z, .. })
| MetaData::NovoB(NovoBData { z, .. })
| MetaData::PLink(PLinkData { z, .. })
| MetaData::InstaNovo(InstaNovoData { z, .. })
| MetaData::MZTab(MZTabData { z, .. }) => Some(*z),
Expand Down Expand Up @@ -317,6 +336,7 @@ impl IdentifiedPeptide {
MetaData::DeepNovoFamily(_)
| MetaData::InstaNovo(_)
| MetaData::Fasta(_)
| MetaData::NovoB(_)
| MetaData::PowerNovo(_)
| MetaData::PepNet(_)
| MetaData::PLink(_) => None,
Expand Down Expand Up @@ -349,7 +369,7 @@ impl IdentifiedPeptide {
)
})
}
MetaData::Novor(NovorData { scan, .. }) => {
MetaData::Novor(NovorData { scan, .. }) | MetaData::NovoB(NovoBData { scan, .. }) => {
SpectrumIds::FileNotKnown(vec![SpectrumId::Index(*scan)])
}
MetaData::DeepNovoFamily(DeepNovoFamilyData { scan, .. }) => SpectrumIds::FileNotKnown(
Expand Down Expand Up @@ -418,6 +438,7 @@ impl IdentifiedPeptide {
*mz
}
MetaData::Sage(SageData { mass, z, .. })
| MetaData::NovoB(NovoBData { mass, z, .. })
| MetaData::PLink(PLinkData { mass, z, .. }) => {
Some(MassOverCharge::new::<crate::system::mz>(
mass.value / (z.value as f64),
Expand All @@ -438,6 +459,7 @@ impl IdentifiedPeptide {
}
MetaData::Novor(NovorData { mass, .. })
| MetaData::Opair(OpairData { mass, .. })
| MetaData::NovoB(NovoBData { mass, .. })
| MetaData::MSFragger(MSFraggerData { mass, .. })
| MetaData::PLink(PLinkData { mass, .. })
| MetaData::Sage(SageData { mass, .. }) => Some(*mass),
Expand All @@ -453,17 +475,23 @@ impl IdentifiedPeptide {

/// Get the absolute ppm error between the experimental and theoretical precursor mass
pub fn ppm_error(&self) -> Option<crate::system::Ratio> {
if let MetaData::PepNet(p) = &self.metadata {
return Some(p.ppm_diff);
match &self.metadata {
MetaData::PepNet(p) => Some(p.ppm_diff),
MetaData::NovoB(p) => Some(if p.score_forward >= p.score_reverse {
p.ppm_diff_forward
} else {
p.ppm_diff_reverse
}),
_ => {
let exp_mass = self.experimental_mass()?;
let theo_mass = self
.peptide()
.and_then(|p| p.formulas().to_vec().pop())
.map(|f| f.monoisotopic_mass())?;

Some(theo_mass.ppm(exp_mass))
}
}

let exp_mass = self.experimental_mass()?;
let theo_mass = self
.peptide()
.and_then(|p| p.formulas().to_vec().pop())
.map(|f| f.monoisotopic_mass())?;

Some(theo_mass.ppm(exp_mass))
}

/// Get the absolute mass error between the experimental and theoretical precursor mass
Expand Down
4 changes: 4 additions & 0 deletions rustyms/src/identification/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ mod instanovo;
mod maxquant;
mod msfragger;
mod mztab;
mod novob;
mod novor;
mod opair;
mod peaks;
Expand All @@ -30,6 +31,7 @@ pub use instanovo::*;
pub use maxquant::*;
pub use msfragger::*;
pub use mztab::*;
pub use novob::*;
pub use novor::*;
pub use opair::*;
pub use peaks::*;
Expand All @@ -49,6 +51,8 @@ mod msfragger_tests;
#[cfg(test)]
mod mztab_test;
#[cfg(test)]
mod novob_tests;
#[cfg(test)]
mod novor_tests;
#[cfg(test)]
mod opair_tests;
Expand Down
165 changes: 165 additions & 0 deletions rustyms/src/identification/novob.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
use crate::{
error::CustomError,
identification::{IdentifiedPeptide, IdentifiedPeptideSource, MetaData},
modification::Ontology,
ontologies::CustomDatabase,
system::Ratio,
system::{usize::Charge, Mass},
AminoAcid, LinearPeptide, SemiAmbiguous, SloppyParsingParameters,
};

use serde::{Deserialize, Serialize};

use super::{
common_parser::Location,
csv::{parse_csv, CsvLine},
BoxedIdentifiedPeptideIter, SequenceElement,
};

use std::sync::OnceLock;

/// A pair of messages (short title, longer explanation) shared by every numeric NovoB column.
/// It is handed to `location.parse(..)` by the column parsers in the `format_family!` invocation
/// in this file, so all number parsing failures report the same text.
static NUMBER_ERROR: (&str, &str) = (
"Invalid NovoB line",
"This column is not a number but it is required to be a number in this format",
);

/// Lazily initialised storage for the NovoB parsing parameters, filled on the first call to
/// `parameters()`.
static PARAMETERS_LOCK: OnceLock<SloppyParsingParameters> = OnceLock::new();

/// Global parsing parameters
///
/// Builds the [`SloppyParsingParameters`] for NovoB sequences on first use and returns the
/// cached instance afterwards. Only the custom alphabet is set, all other parameters keep their
/// default value. The custom alphabet maps a single sequence byte onto an amino acid that
/// carries a modification:
/// * `s`, `t`, `y`: serine, threonine, and tyrosine with Unimod 21
/// * `n`, `q`: asparagine and glutamine with Unimod 7
/// * `C`: cysteine with Unimod 6
/// * `m`: methionine with Unimod 35
#[allow(clippy::missing_panics_doc)] // These modifications exist
fn parameters() -> &'static SloppyParsingParameters {
    PARAMETERS_LOCK.get_or_init(|| SloppyParsingParameters {
        custom_alphabet: vec![
            (
                b's',
                SequenceElement::new(AminoAcid::Serine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(21, None).unwrap()),
            ),
            // `t` is the one letter code for threonine (and `y` for tyrosine), these two were
            // swapped before which resulted in the wrong residue for both letters.
            (
                b't',
                SequenceElement::new(AminoAcid::Threonine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(21, None).unwrap()),
            ),
            (
                b'y',
                SequenceElement::new(AminoAcid::Tyrosine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(21, None).unwrap()),
            ),
            (
                b'n',
                SequenceElement::new(AminoAcid::Asparagine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(7, None).unwrap()),
            ),
            (
                b'q',
                SequenceElement::new(AminoAcid::Glutamine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(7, None).unwrap()),
            ),
            // Upper case on purpose: every cysteine is read as modified.
            // NOTE(review): confirm Unimod 6 is the cysteine modification NovoB assumes.
            (
                b'C',
                SequenceElement::new(AminoAcid::Cysteine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(6, None).unwrap()),
            ),
            (
                b'm',
                SequenceElement::new(AminoAcid::Methionine.into(), None)
                    .with_simple_modification(Ontology::Unimod.find_id(35, None).unwrap()),
            ),
        ],
        ..Default::default()
    })
}

// Generates the NovoB format description type (`NovoBFormat`), the per line data type
// (`NovoBData`), and ties them to `NovoBVersion` through the `format_family!` macro.
// NOTE(review): the macro definition is not in this file; the meaning of the positional
// arguments below is inferred from their shape, confirm against the macro.
format_family!(
/// The format for any NovoB file
NovoBFormat,
/// The data from any NovoB file
NovoBData,
// Version type, the list of known formats, the byte `b'\t'` (presumably the column separator),
// and a list of names (presumably the header to assume, the same names are used as column
// names in `NOVOB_V0_0_1`).
NovoBVersion, [&NOVOB_V0_0_1], b'\t', Some(vec![
"mcount".to_string(),
"charge".to_string(),
"pepmass".to_string(),
"senten".to_string(),
"delta_mass".to_string(),
"prob".to_string(),
"senten_reverse".to_string(),
"delta_mass_reverse".to_string(),
"prob_reverse".to_string()
]);

// Each entry is `field: Type, parser`, where the parser receives the cell `Location` and the
// optional custom modification database.
required {
// Parsed as a plain number, failures report `NUMBER_ERROR`.
scan: usize, |location: Location, _| location.parse(NUMBER_ERROR);
// Parsed as `usize` and wrapped as a charge in units of `e`.
z: Charge, |location: Location, _| location.parse::<usize>(NUMBER_ERROR).map(Charge::new::<crate::system::e>);
// Parsed as `f64` and wrapped as a mass in dalton.
mass: Mass, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(Mass::new::<crate::system::dalton>);

// Forward prediction: score, mass difference (stored as a ppm `Ratio`), and the peptide.
score_forward: f64, |location: Location, _| location.parse::<f64>(NUMBER_ERROR);
ppm_diff_forward: Ratio, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(Ratio::new::<crate::system::ratio::ppm>);
peptide_forward: LinearPeptide<SemiAmbiguous>, | location: Location, custom_database: Option<&CustomDatabase>| {
// Strip a leading `['` and a trailing `']` from the cell before parsing the sequence.
let location = location.trim_start_matches("['").trim_end_matches("']");
// Parse with the NovoB specific custom alphabet from `parameters()`.
LinearPeptide::sloppy_pro_forma(
location.full_line(),
location.location.clone(),
custom_database,
parameters(),
)};

// Reverse prediction: parsed exactly like the forward columns.
score_reverse: f64, |location: Location, _| location.parse::<f64>(NUMBER_ERROR);
ppm_diff_reverse: Ratio, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(Ratio::new::<crate::system::ratio::ppm>);
peptide_reverse: LinearPeptide<SemiAmbiguous>, | location: Location, custom_database: Option<&CustomDatabase>| {
// Strip a leading `['` and a trailing `']` from the cell before parsing the sequence.
let location = location.trim_start_matches("['").trim_end_matches("']");
LinearPeptide::sloppy_pro_forma(
location.full_line(),
location.location.clone(),
custom_database,
parameters(),
)};
}
// NovoB has no optional columns.
optional { }
);

impl From<NovoBData> for IdentifiedPeptide {
    /// Wrap a parsed NovoB line as a generic identified peptide.
    ///
    /// The score is the higher of the forward and reverse score, no local confidence is set,
    /// and the full NovoB line is kept as metadata.
    fn from(value: NovoBData) -> Self {
        // Read both scores before `value` is moved into the metadata.
        let best_score = f64::max(value.score_forward, value.score_reverse);
        Self {
            score: Some(best_score),
            local_confidence: None,
            metadata: MetaData::NovoB(value),
        }
    }
}

/// The column names for NovoB v0.0.1, the only NovoB version known so far.
///
/// Every field holds the header name of the column that provides that value.
pub const NOVOB_V0_0_1: NovoBFormat = NovoBFormat {
    version: NovoBVersion::V0_0_1,

    // Spectrum level columns
    scan: "mcount",
    z: "charge",
    mass: "pepmass",

    // Forward prediction
    peptide_forward: "senten",
    ppm_diff_forward: "delta_mass",
    score_forward: "prob",

    // Reverse prediction
    peptide_reverse: "senten_reverse",
    ppm_diff_reverse: "delta_mass_reverse",
    score_reverse: "prob_reverse",
};

/// The versions of the NovoB format that can be parsed.
// The variant names contain underscores to spell out the version number.
#[allow(non_camel_case_types)]
#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum NovoBVersion {
    /// NovoB version 0.0.1, used as the default version
    #[default]
    V0_0_1,
}

impl std::fmt::Display for NovoBVersion {
    /// Write the version as its human readable tag, for example `v0.0.1`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
        let tag = match self {
            Self::V0_0_1 => "v0.0.1",
        };
        // The tag is written as is, width and fill settings of the formatter are not applied.
        f.write_str(tag)
    }
}
Loading

0 comments on commit 2520d49

Please sign in to comment.