Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/snijderlab/rustyms
Browse files Browse the repository at this point in the history
  • Loading branch information
aukeheerdink committed Feb 11, 2025
2 parents 56859da + ba6dad7 commit d686cf9
Show file tree
Hide file tree
Showing 183 changed files with 2,411 additions and 199,088 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/generate-databases.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: Generate databases

on:
workflow_dispatch:
schedule:
- cron: "7 4 1 * *"
push:
branches: ["release", "main"]

permissions:
contents: write
pull-requests: write

env:
CARGO_TERM_COLOR: always
CARGO_ENCODED_RUSTFLAGS: --cfg=github_action

jobs:
generate-databases:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Check
run: cargo check
- name: Download and build external databases
  run: |
    bash ./.github/workflows/scripts/update-all-databases.sh
    # /tmp/MESSAGES is an append-only log and can hold several lines, while a
    # plain `NAME=value` entry in GITHUB_ENV only supports single-line values.
    # Use the `NAME<<DELIMITER` form so multi-line output stays intact.
    MESSAGE="$(cat /tmp/MESSAGES)"
    {
      echo "MESSAGE<<RUSTYMS_MESSAGES_EOF"
      echo "${MESSAGE}"
      echo "RUSTYMS_MESSAGES_EOF"
    } >> "${GITHUB_ENV}"
- name: Create pull request
id: cpr
uses: peter-evans/create-pull-request@v7
with:
token: ${{ secrets.RUSTEOMICS_MZCORE_PR_TOKEN }}
commit-message: Update external databases and ontologies
committer: GitHub <noreply@github.com>
author: GitHub <noreply@github.com>
branch: update-databases
delete-branch: true
title: "Update external databases and ontologies"
body: |
This automated PR updates the binary blobs for external databases
and ontologies.
Below are messages from script execution:
> ${{ env.MESSAGE }}
labels: |
A-rustyms-generate-databases
C-maintenance
76 changes: 76 additions & 0 deletions .github/workflows/scripts/update-all-databases.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env bash


# Print the usage text for this script and terminate the shell.
#
# Always exits with status 1 (also for an explicit -h/--help), so this
# function never returns to its caller.
function help {
    # The usage line must name this script (update-all-databases.sh); it
    # previously advertised a different file name.
    echo "Usage: update-all-databases.sh"
    echo ""
    echo "Download the required databases and build the "
    echo "required binary representations of the ontologies."
    echo ""
    echo "Options:"
    echo " -h, --help Display this help and exit"
    exit 1
}


# Fetch the IMGT LIGM-DB dump, decompress it into
# rustyms-generate-imgt/data/imgt.dat and run the rustyms-generate-imgt
# binary. If any of these steps fails, a note is appended to /tmp/MESSAGES
# instead of aborting, so the caller can carry on with the other databases.
function make-imgt {
    echo "Downloading IMGT..."
    mkdir -p rustyms-generate-imgt/data
    # The IMGT server is not very reliable and is sometimes down, which is
    # why a failure of the grouped commands is recorded rather than fatal.
    if ! {
        curl https://www.imgt.org/download/LIGM-DB/imgt.dat.Z \
            | gunzip -c > rustyms-generate-imgt/data/imgt.dat \
            && echo "Serializing IMGT ..." \
            && cargo run --bin rustyms-generate-imgt
    }; then
        echo "Failed to download IMGT. I did not update it." >> /tmp/MESSAGES
    fi
}


# Download the relevant ontologies into rustyms-generate-databases/data and
# run the rustyms-generate-databases binary to serialize them to binary blobs.
#
# PSI-MOD, Unimod, RESID and XLMOD are stored as downloaded; GNOme and the
# GlycoSmos glycan list are stored gzip-compressed.
function make-ontologies {
    echo "Downloading databases..."
    db_data="rustyms-generate-databases/data"
    mkdir -p "${db_data}"
    curl https://raw.githubusercontent.com/HUPO-PSI/psi-mod-CV/refs/heads/master/PSI-MOD-newstyle.obo \
        > "${db_data}/PSI-MOD-newstyle.obo"
    curl http://www.unimod.org/obo/unimod.obo > "${db_data}/unimod.obo"
    curl ftp://ftp.proteininformationresource.org/pir_databases/other_databases/resid/RESIDUES.XML \
        > "${db_data}/RESID-RESIDUES.XML"
    curl https://raw.githubusercontent.com/HUPO-PSI/mzIdentML/master/cv/XLMOD.obo \
        > "${db_data}/XLMOD.obo"
    # Drop the `def:` lines and the listed GNO property_value lines before
    # compressing (presumably to shrink the stored file; confirm the generator
    # does not read them). sed works line by line, so the pattern is anchored
    # at the line start and must not contain `\n`; -E is required for the
    # `(a|b)` alternation, which plain sed would treat as literal characters.
    curl -L http://purl.obolibrary.org/obo/gno.obo \
        | sed -E '/^(property_value: GNO:00000(022|023|041|042|101|102) |def: )/d' \
        | gzip -c \
        > "${db_data}/GNOme.obo.gz"
    curl -L https://glycosmos.org/download/glycosmos_glycans_list.csv \
        | gzip -c > "${db_data}/glycosmos_glycans_list.csv.gz"


    echo "Serializing the other databases..."
    cargo run --bin rustyms-generate-databases
}


# Entry point of the script.
#
# Any argument leads to the usage text: -h/--help print it directly, anything
# else is first reported as unknown. `help` exits the script, so at most the
# first argument is ever looked at. Without arguments the message file is
# created and the IMGT and ontology builds run in that order.
function main {
    local arg
    for arg in "$@"; do
        if [[ "${arg}" != "-h" && "${arg}" != "--help" ]]; then
            echo "Unknown argument: ${arg}"
        fi
        help
    done

    # Make sure the message file exists even when no step reports anything.
    touch /tmp/MESSAGES

    make-imgt
    make-ontologies
}


main "$@"
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
rustyms-generate-databases/data/*
!rustyms-generate-databases/data/CIAAW*
!rustyms-generate-databases/data/IUPAC*
/target
Cargo.lock
*/target/
Expand All @@ -11,4 +14,4 @@ docs/**/_build/
out
out_peaks
out_pro_forma
out_sloppy_pro_forma
out_sloppy_pro_forma
37 changes: 37 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
title: rustyms
version: 0.9.0
abstract: A Rust library for parsing ProForma peptides and matching them against MS spectra.
authors:
- affiliation: Utrecht University
family-names: Schulte
given-names: Douwe
orcid: https://orcid.org/0000-0003-0594-0993
- affiliation: VIB-UGent Center for Medical Biotechnology
family-names: Gabriels
given-names: Ralf
orcid: https://orcid.org/0000-0002-1679-1711
- affiliation: Utrecht University
family-names: Heerdink
given-names: Auke
cff-version: 1.2.0
identifiers:
- description: Main paper (preprint)
type: doi
value: 10.1101/2025.01.18.633732
- description: Mass alignment algorithm paper
type: doi
value: 10.1021/acs.jproteome.4c00188
- description: Mass alignment algorithm paper (preprint)
type: url
value: https://www.biorxiv.org/content/10.1101/2024.02.20.581155v1
- description: Repository
type: url
value: https://github.com/snijderlab/rustyms
keywords:
- sequencing
- antibody
- mass-spectrometry
- de novo
license: MIT OR Apache-2.0
message: If you use this software, please cite it using these metadata.
repository-code: https://github.com/snijderlab/rustyms
25 changes: 16 additions & 9 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ package.authors = [
]
package.edition = "2021"
package.license = "MIT OR Apache-2.0"
package.rust-version = "1.75.0"
package.version = "0.9.0-alpha.3"
package.rust-version = "1.84.0"
package.version = "0.9.0"

[profile.release]
debug = true
Expand All @@ -29,21 +29,28 @@ codegen-units = 1
afl = "0.15"
bincode = "1.3"
clap = { version = "4.5", features = ["derive", "cargo"] }
directories = "5.0"
directories = "6.0"
flate2 = "1.0"
iai-callgrind = "0.14"
itertools = "0.13"
mzdata = "0.40"
itertools = "0.14"
mzdata = "0.44"
ndarray = "0.16"
ordered-float = { version = "4.5", features = ["serde"] }
ordered-float = { version = "4.6", features = ["serde"] }
probability = "0.20"
pyo3 = "0.23"
rand = "0.8"
rayon = "1.9"
rand = "0.9"
rayon = "1.10"
regex = "1.11"
roxmltree = "0.20"
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0"
similar = "2.6"
similar = "2.7"
thin-vec = { version = "0.2", features = ["serde"] }
uom = { version = "0.36", features = ["use_serde", "usize", "isize"] }

[workspace.lints.rust]
unexpected_cfgs = { level = "allow", check-cfg = [
"cfg(github_action)",
"cfg(si)",
"cfg(f32)",
] }
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@

# Match those fragments!

A peptide fragmentation matching library for Rust. Built to handle very complex peptides in a sensible way.
A peptide fragmentation matching library for Rust. Built to handle very complex peptidoforms in a sensible way.

## Features

- Read [ProForma](https://github.com/HUPO-PSI/ProForma) sequences (complete specification supported: 'level 2-ProForma + top-down compliant + cross-linking compliant + glycans compliant + mass spectrum compliant')
- Generate theoretical fragments with control over the fragmentation model from any ProForma peptidoform/proteoform
- Read [ProForma](https://github.com/HUPO-PSI/ProForma) sequences (complete 2.0 specification supported: 'level 2-ProForma + top-down compliant + cross-linking compliant + glycans compliant + mass spectrum compliant')
- Generate theoretical fragments with control over the fragmentation model from any ProForma peptidoform
- Generate theoretical fragments for chimeric spectra
- Generate theoretical fragments for cross-links (also disulfides)
- Generate theoretical fragments for modifications of unknown position
- Generate peptide backbone (a, b, c, x, y, and z) and satellite ion fragments (w, d, and v)
- Generate peptide backbone (a, b, c, x, y, and z) and satellite ion fragments (d, v, and w)
- Generate glycan fragments (B, Y, and internal fragments)
- Integrated with [mzdata](https://crates.io/crates/mzdata) for reading raw data files
- Match spectra to the generated fragments
- [Align peptides based on mass](https://pubs.acs.org/doi/10.1021/acs.jproteome.4c00188)
- Fast access to the IMGT database of antibody germlines
- Reading of multiple identified peptide file formats (Fasta, MaxQuant, MSFragger, Novor, OPair, Peaks, and Sage)
- Reading of multiple identified peptide file formats (among others: Fasta, MaxQuant, MSFragger, Novor, OPair, Peaks, and Sage)
- Exhaustively fuzz tested for reliability (using [cargo-afl](https://crates.io/crates/cargo-afl))
- Extensive use of [uom](https://docs.rs/uom/latest/uom/) for compile time unit checking
- Python bindings are provided to several core components of the rustyms library. Go to the [Python documentation](https://rustyms.readthedocs.io/) for more information.
Expand Down
5 changes: 2 additions & 3 deletions docs/python/source/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Python bindings are provided to several core components of the rustyms library,
- {py:class}`~rustyms.Fragment` Theoretical fragment ion
- {py:class}`~rustyms.SequenceElement` One position in a peptide sequence with amino acid and
modifications
- {py:class}`~rustyms.LinearPeptide` Peptide sequence, modifications, and charge, using
- {py:class}`~rustyms.CompoundPeptidoformIon` Peptide sequence, modifications, and charge, using
[ProForma 2.0](https://proforma.readthedocs.io) (see {ref}`ProForma support` for more
information)
- {py:class}`~rustyms.RawPeak` A single peak in a mass spectrum
Expand All @@ -43,7 +43,7 @@ raw_spectrum = rustyms.RawSpectrum(
)

# Create a new peptide from a ProForma 2.0 string
peptide = rustyms.LinearPeptide("ACDE/2")
peptide = rustyms.CompoundPeptidoformIon("ACDE/2")

# Annotate the spectrum with the peptide
annotated_spectrum = raw_spectrum.annotate(peptide, "cid_hcd")
Expand All @@ -68,7 +68,6 @@ rustyms Python bindings.
:maxdepth: 2
About <self>
proforma-support
api
contributing
```
3 changes: 0 additions & 3 deletions docs/python/source/proforma-support.md

This file was deleted.

3 changes: 3 additions & 0 deletions examples/de-novo-align/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ clap = { workspace = true }
itertools = { workspace = true }
rayon = { workspace = true }
serde_json = { workspace = true }

[lints]
workspace = true
2 changes: 1 addition & 1 deletion examples/de-novo-align/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fn main() {
peptide,
align::<4, SemiAmbiguous, SemiAmbiguous>(
db.peptide(),
&linear_peptide,
linear_peptide,
AlignScoring::default(),
AlignType::EITHER_GLOBAL,
),
Expand Down
3 changes: 3 additions & 0 deletions examples/multi-annotator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ directories = { workspace = true }
itertools = { workspace = true }
rayon = { workspace = true }
serde_json = { workspace = true }

[lints]
workspace = true
2 changes: 1 addition & 1 deletion examples/multi-annotator/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ fn main() {
.parse::<usize>()
.unwrap();
let z = line.index_column("z").unwrap().0.parse::<usize>().unwrap();
let peptide = CompoundPeptidoform::pro_forma(
let peptide = CompoundPeptidoformIon::pro_forma(
line.index_column("sequence").unwrap().0,
custom_database.as_ref(),
)
Expand Down
3 changes: 3 additions & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@ path = "fuzz_targets/peaks.rs"
test = false
doc = false
bench = false

[lints]
workspace = true
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/peaks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ impl<'a> StringReader<'a> {
}
}

impl<'a> Read for StringReader<'a> {
impl Read for StringReader<'_> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
for (i, item) in buf.iter_mut().enumerate() {
if let Some(x) = self.iter.next() {
Expand Down
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/pro_forma.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use afl::*;
fn main() {
fuzz!(|data: &[u8]| {
if let Ok(s) = std::str::from_utf8(data) {
let _ = rustyms::CompoundPeptidoform::pro_forma(s, None);
let _ = rustyms::CompoundPeptidoformIon::pro_forma(s, None);
}
});
}
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/sloppy_pro_forma.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use afl::*;
fn main() {
fuzz!(|data: &[u8]| {
if let Ok(s) = std::str::from_utf8(data) {
let _ = rustyms::LinearPeptide::sloppy_pro_forma(
let _ = rustyms::Peptidoform::sloppy_pro_forma(
s,
0..s.len(),
None,
Expand Down
3 changes: 3 additions & 0 deletions rustyms-generate-databases/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@ uom = { workspace = true }

[features]
rayon = []

[lints]
workspace = true
Binary file removed rustyms-generate-databases/data/GNOme.obo.gz
Binary file not shown.
Loading

0 comments on commit d686cf9

Please sign in to comment.