From 48cb02ad65046017c3b6dd136fa3e9b5cdd082f0 Mon Sep 17 00:00:00 2001 From: adamrtalbot <12817534+adamrtalbot@users.noreply.github.com> Date: Thu, 12 Feb 2026 18:24:03 +0000 Subject: [PATCH 1/6] Add release process with cross-platform binaries and Docker container - GitHub Actions release workflow triggered by version tags (v*) - Builds binaries for Linux (x86_64, aarch64), macOS (x86_64, aarch64) - Multi-arch Docker image published to GHCR (ghcr.io/ewels/duprust) - CI workflow with test, fmt, and clippy checks on PRs - Multi-stage Dockerfile optimized for rust-htslib static builds Generated by Claude Code --- .dockerignore | 9 ++ .github/workflows/ci.yml | 76 ++++++++++++++ .github/workflows/release.yml | 190 ++++++++++++++++++++++++++++++++++ Dockerfile | 30 ++++++ 4 files changed, 305 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/release.yml create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0382d24 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +target/ +.git/ +.github/ +benchmark/ +tests/ +*.md +LICENSE +.gitignore +.dockerignore diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..104305c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,76 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + +permissions: + contents: read + +env: + CARGO_TERM_COLOR: always + +jobs: + test: + name: Test (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Install Linux build deps + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y cmake zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev pkg-config clang + + - name: Install macOS 
build deps + if: runner.os == 'macOS' + run: brew install bzip2 xz + + - uses: Swatinem/rust-cache@v2 + + - name: Build + run: cargo build --release + + - name: Test + run: cargo test --release + + fmt: + name: Formatting + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - run: cargo fmt --check + + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: clippy + + - name: Install Linux build deps + run: | + sudo apt-get update + sudo apt-get install -y cmake zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev pkg-config clang + + - uses: Swatinem/rust-cache@v2 + + - run: cargo clippy -- -D warnings diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..e26d307 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,190 @@ +name: Release + +on: + push: + tags: + - "v[0-9]+.*" + +permissions: + contents: write + packages: write + +env: + CARGO_TERM_COLOR: always + +jobs: + # ------------------------------------------------------------------ + # 1. Create a draft GitHub release + # ------------------------------------------------------------------ + create-release: + name: Create release + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + steps: + - uses: actions/checkout@v4 + + - name: Get tag + id: tag + run: echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT" + + - name: Create draft release + env: + GH_TOKEN: ${{ github.token }} + run: gh release create "${{ steps.tag.outputs.tag }}" --draft --verify-tag --title "${{ steps.tag.outputs.tag }}" + + # ------------------------------------------------------------------ + # 2. 
Build binaries for each target + # ------------------------------------------------------------------ + build-binaries: + name: Build ${{ matrix.name }} + needs: create-release + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + # Linux x86_64 + - name: linux-x86_64 + os: ubuntu-latest + target: x86_64-unknown-linux-gnu + use-cross: false + # Linux aarch64 + - name: linux-aarch64 + os: ubuntu-latest + target: aarch64-unknown-linux-gnu + use-cross: true + # macOS x86_64 + - name: macos-x86_64 + os: macos-13 + target: x86_64-apple-darwin + use-cross: false + # macOS aarch64 (Apple Silicon) + - name: macos-aarch64 + os: macos-latest + target: aarch64-apple-darwin + use-cross: false + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + # Linux build dependencies (native) + - name: Install Linux build deps + if: runner.os == 'Linux' && !matrix.use-cross + run: | + sudo apt-get update + sudo apt-get install -y cmake zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev pkg-config clang + + # macOS build dependencies + - name: Install macOS build deps + if: runner.os == 'macOS' + run: brew install bzip2 xz + + # Cross (for Linux aarch64) + - name: Install cross + if: matrix.use-cross + uses: taiki-e/install-action@v2 + with: + tool: cross + + - name: Build + run: | + if [ "${{ matrix.use-cross }}" = "true" ]; then + cross build --release --target ${{ matrix.target }} + else + cargo build --release --target ${{ matrix.target }} + fi + + - name: Package + id: package + run: | + BIN="duprust" + TAG="${{ needs.create-release.outputs.tag }}" + ARCHIVE="${BIN}-${TAG}-${{ matrix.name }}" + + mkdir -p "staging/${ARCHIVE}" + cp "target/${{ matrix.target }}/release/${BIN}" "staging/${ARCHIVE}/" + cp README.md LICENSE "staging/${ARCHIVE}/" 2>/dev/null || true + + cd staging + tar czf "../${ARCHIVE}.tar.gz" "${ARCHIVE}" + cd .. 
+ + # SHA256 checksum + shasum -a 256 "${ARCHIVE}.tar.gz" > "${ARCHIVE}.tar.gz.sha256" + + echo "archive=${ARCHIVE}.tar.gz" >> "$GITHUB_OUTPUT" + echo "checksum=${ARCHIVE}.tar.gz.sha256" >> "$GITHUB_OUTPUT" + + - name: Upload to release + env: + GH_TOKEN: ${{ github.token }} + run: | + gh release upload "${{ needs.create-release.outputs.tag }}" \ + "${{ steps.package.outputs.archive }}" \ + "${{ steps.package.outputs.checksum }}" + + # ------------------------------------------------------------------ + # 3. Build & push Docker image to GHCR + # ------------------------------------------------------------------ + docker: + name: Docker image + needs: create-release + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=raw,value=latest + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + # ------------------------------------------------------------------ + # 4. 
Publish the release (undraft) + # ------------------------------------------------------------------ + publish-release: + name: Publish release + needs: [create-release, build-binaries, docker] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Publish release + env: + GH_TOKEN: ${{ github.token }} + run: gh release edit "${{ needs.create-release.outputs.tag }}" --draft=false diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..65c0a78 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +# ---- Build stage ---- +FROM rust:1-bookworm AS builder + +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake \ + zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + libcurl4-openssl-dev \ + libssl-dev \ + pkg-config \ + clang \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /build +COPY Cargo.toml Cargo.lock ./ +COPY src/ src/ + +RUN cargo build --release && strip target/release/duprust + +# ---- Runtime stage ---- +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /build/target/release/duprust /usr/local/bin/duprust + +ENTRYPOINT ["duprust"] From 4d987ecf234098b0e32bdfaae564c2e484586177 Mon Sep 17 00:00:00 2001 From: adamrtalbot <12817534+adamrtalbot@users.noreply.github.com> Date: Thu, 12 Feb 2026 19:33:36 +0000 Subject: [PATCH 2/6] Fix CI: add libfontconfig1-dev dependency and run cargo fmt - Add libfontconfig1-dev to apt-get in ci.yml, release.yml, and Dockerfile (required by plotters via yeslogic-fontconfig-sys) - Run cargo fmt on all source files to pass formatting check Generated by Claude Code --- .github/workflows/ci.yml | 4 +- .github/workflows/release.yml | 2 +- Dockerfile | 1 + src/counting.rs | 35 ++++++---- src/gtf.rs | 3 +- src/main.rs | 5 +- src/plots.rs | 127 ++++++++++++++++++++++++++-------- tests/integration_test.rs | 28 ++------ 8 files changed, 130 insertions(+), 75 deletions(-) diff 
--git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 104305c..cab327d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y cmake zlib1g-dev libbz2-dev liblzma-dev \ - libcurl4-openssl-dev libssl-dev pkg-config clang + libcurl4-openssl-dev libssl-dev libfontconfig1-dev pkg-config clang - name: Install macOS build deps if: runner.os == 'macOS' @@ -69,7 +69,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y cmake zlib1g-dev libbz2-dev liblzma-dev \ - libcurl4-openssl-dev libssl-dev pkg-config clang + libcurl4-openssl-dev libssl-dev libfontconfig1-dev pkg-config clang - uses: Swatinem/rust-cache@v2 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e26d307..916b180 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -79,7 +79,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y cmake zlib1g-dev libbz2-dev liblzma-dev \ - libcurl4-openssl-dev libssl-dev pkg-config clang + libcurl4-openssl-dev libssl-dev libfontconfig1-dev pkg-config clang # macOS build dependencies - name: Install macOS build deps diff --git a/Dockerfile b/Dockerfile index 65c0a78..7b780c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ liblzma-dev \ libcurl4-openssl-dev \ libssl-dev \ + libfontconfig1-dev \ pkg-config \ clang \ && rm -rf /var/lib/apt/lists/* diff --git a/src/counting.rs b/src/counting.rs index 7110acd..8b0b440 100644 --- a/src/counting.rs +++ b/src/counting.rs @@ -137,9 +137,7 @@ impl ChromIndex { // Binary search for the first interval that could overlap // An interval overlaps [start, end) if interval.start < end AND interval.end > start - let search_start = self - .intervals - .partition_point(|iv| iv.end <= start); + let search_start = self.intervals.partition_point(|iv| iv.end <= start); for iv in 
&self.intervals[search_start..] { if iv.start >= end { @@ -326,9 +324,7 @@ pub fn count_reads( // Get chromosome names from header let header = bam.header().clone(); let tid_to_name: Vec = (0..header.target_count()) - .map(|tid| { - String::from_utf8_lossy(header.tid2name(tid)).to_string() - }) + .map(|tid| String::from_utf8_lossy(header.tid2name(tid)).to_string()) .collect(); // Track statistics @@ -427,8 +423,7 @@ pub fn count_reads( let gene_hits = if let Some(chrom_idx) = index.get(chrom) { // Extract aligned blocks from CIGAR (M/=/X operations only). // This avoids false overlaps with genes in introns of spliced reads. - let aligned_blocks = - cigar_to_aligned_blocks(record.pos() as u64, &record.cigar()); + let aligned_blocks = cigar_to_aligned_blocks(record.pos() as u64, &record.cigar()); let mut overlaps = Vec::new(); for (block_start, block_end) in &aligned_blocks { @@ -438,9 +433,7 @@ pub fn count_reads( // Filter by strand and deduplicate gene IDs let mut genes_hit: Vec = overlaps .iter() - .filter(|iv| { - strand_matches(is_reverse, is_read1, paired, iv.strand, stranded) - }) + .filter(|iv| strand_matches(is_reverse, is_read1, paired, iv.strand, stranded)) .map(|iv| iv.gene_id.clone()) .collect(); genes_hit.sort_unstable(); @@ -482,7 +475,11 @@ pub fn count_reads( // For the fragment, use read1's dup/multi status (featureCounts // considers a fragment as duplicate if read1 is flagged as duplicate) let frag_is_dup = if is_read1 { is_dup } else { mate_info.is_dup }; - let frag_is_multi = if is_read1 { is_multi } else { mate_info.is_multi }; + let frag_is_multi = if is_read1 { + is_multi + } else { + mate_info.is_multi + }; // Update N totals (once per fragment) n_multi_dup += 1; @@ -499,7 +496,12 @@ pub fn count_reads( combined_genes.dedup(); // Assign to gene if unambiguous (exactly one gene from combined overlaps) - assign_fragment_to_gene(&combined_genes, &mut gene_counts, frag_is_dup, frag_is_multi); + assign_fragment_to_gene( + &combined_genes, + &mut 
gene_counts, + frag_is_dup, + frag_is_multi, + ); } else { // First mate seen - buffer it and wait for the other mate mate_buffer.insert( @@ -527,7 +529,12 @@ pub fn count_reads( n_unique_nodup += 1; } - assign_fragment_to_gene(&mate_info.gene_hits, &mut gene_counts, mate_info.is_dup, mate_info.is_multi); + assign_fragment_to_gene( + &mate_info.gene_hits, + &mut gene_counts, + mate_info.is_dup, + mate_info.is_multi, + ); } info!( diff --git a/src/gtf.rs b/src/gtf.rs index df57775..2c84ad1 100644 --- a/src/gtf.rs +++ b/src/gtf.rs @@ -192,7 +192,8 @@ mod tests { #[test] fn test_get_attribute() { - let attrs = r#"gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1";"#; + let attrs = + r#"gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1";"#; assert_eq!( get_attribute(attrs, "gene_id"), Some("ENSG00000223972".to_string()) diff --git a/src/main.rs b/src/main.rs index 6238bec..e1ca28a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -187,10 +187,7 @@ fn main() -> Result<()> { stats.f_regions_duplication * 100.0 ); - info!( - "Total runtime: {:.2}s", - start.elapsed().as_secs_f64() - ); + info!("Total runtime: {:.2}s", start.elapsed().as_secs_f64()); Ok(()) } diff --git a/src/plots.rs b/src/plots.rs index 9b29cbf..d1473ef 100644 --- a/src/plots.rs +++ b/src/plots.rs @@ -129,9 +129,7 @@ fn estimate_density(x: &[f64], y: &[f64], nbins: usize) -> Vec { let nx = bx as i32 + dx; let ny = by as i32 + dy; if nx >= 0 && nx <= nbins as i32 && ny >= 0 && ny <= nbins as i32 { - let w = (-(dx * dx) as f64 / sigma2_x - - (dy * dy) as f64 / sigma2_y) - .exp(); + let w = (-(dx * dx) as f64 / sigma2_x - (dy * dy) as f64 / sigma2_y).exp(); smoothed[nx as usize][ny as usize] += c * w; } } @@ -212,8 +210,11 @@ fn draw_dotted_vline( let mut y = y_top; while y < y_bot { let ye = (y + dash).min(y_bot); - root.draw(&PathElement::new(vec![(x, y), (x, ye)], color.stroke_width(sw))) - .ok(); + root.draw(&PathElement::new( + vec![(x, y), (x, 
ye)], + color.stroke_width(sw), + )) + .ok(); y = ye + gap; } } @@ -296,8 +297,7 @@ where // ── points (sorted by density → dense on top) ────────────────────── // R uses pch=20 cex=0.25 → tiny filled dots - let mut order: Vec<(usize, f64)> = - densities.iter().enumerate().map(|(i, d)| (i, *d)).collect(); + let mut order: Vec<(usize, f64)> = densities.iter().enumerate().map(|(i, d)| (i, *d)).collect(); order.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); for &(i, _) in &order { @@ -395,7 +395,11 @@ where let ly = pa_y.0 + (pxs * 8.0) as i32; root.draw(&Rectangle::new( [(lx, ly), (lx + lw, ly + lh)], - ShapeStyle { color: WHITE.to_rgba(), filled: true, stroke_width: 0 }, + ShapeStyle { + color: WHITE.to_rgba(), + filled: true, + stroke_width: 0, + }, ))?; root.draw(&Rectangle::new( [(lx, ly), (lx + lw, ly + lh)], @@ -426,7 +430,11 @@ where let samp_x = lx + (pxs * 4.0) as i32; root.draw(&Rectangle::new( [(lx, ly), (lx + lw, ly + lh)], - ShapeStyle { color: WHITE.to_rgba(), filled: true, stroke_width: 0 }, + ShapeStyle { + color: WHITE.to_rgba(), + filled: true, + stroke_width: 0, + }, ))?; root.draw(&Rectangle::new( [(lx, ly), (lx + lw, ly + lh)], @@ -434,8 +442,16 @@ where ))?; let mut cy = ly + (pxs * 5.0) as i32; // "1 read/bp" with red dashed sample line - draw_dotted_vline(&root, samp_x + (pxs * 7.0) as i32, cy, cy + line_h - 2, &RED, sw, - (pxs * 3.0) as i32, (pxs * 2.0) as i32); + draw_dotted_vline( + &root, + samp_x + (pxs * 7.0) as i32, + cy, + cy + line_h - 2, + &RED, + sw, + (pxs * 3.0) as i32, + (pxs * 2.0) as i32, + ); root.draw(&Text::new( "1 read/bp", (txt_x, cy), @@ -443,8 +459,16 @@ where ))?; cy += line_h; // "0.5 RPKM" with green dashed sample line - draw_dotted_vline(&root, samp_x + (pxs * 7.0) as i32, cy, cy + line_h - 2, &GREEN, sw, - (pxs * 3.0) as i32, (pxs * 2.0) as i32); + draw_dotted_vline( + &root, + samp_x + (pxs * 7.0) as i32, + cy, + cy + line_h - 2, + &GREEN, + sw, + (pxs * 3.0) as i32, + (pxs * 2.0) as 
i32, + ); root.draw(&Text::new( "0.5 RPKM", (txt_x, cy), @@ -552,7 +576,12 @@ where } else { format!("{:.1}", mean_rpk) }; - labels.push(format!("{} - {} % / {}", (pl * 100.0) as u32, (ph * 100.0) as u32, rpk_s)); + labels.push(format!( + "{} - {} % / {}", + (pl * 100.0) as u32, + (ph * 100.0) as u32, + rpk_s + )); bins.push(vals); } @@ -579,7 +608,11 @@ where } }) .x_labels(n_bins) - .x_label_style(("sans-serif", ps(8.0)).into_font().transform(FontTransform::Rotate270)) + .x_label_style( + ("sans-serif", ps(8.0)) + .into_font() + .transform(FontTransform::Rotate270), + ) .y_label_formatter(&|v| format!("{:.1}", v)) .axis_desc_style(("sans-serif", ps(13.0))) .label_style(("sans-serif", ps(11.0))) @@ -598,8 +631,17 @@ where let med = quantile(&sv, 0.5); let q3 = quantile(&sv, 0.75); let iqr = q3 - q1; - let wl = sv.iter().find(|&&v| v >= q1 - 1.5 * iqr).copied().unwrap_or(q1); - let wh = sv.iter().rev().find(|&&v| v <= q3 + 1.5 * iqr).copied().unwrap_or(q3); + let wl = sv + .iter() + .find(|&&v| v >= q1 - 1.5 * iqr) + .copied() + .unwrap_or(q1); + let wh = sv + .iter() + .rev() + .find(|&&v| v <= q3 + 1.5 * iqr) + .copied() + .unwrap_or(q3); let bl = idx as f64 + 0.2; let br = idx as f64 + 0.8; @@ -609,7 +651,11 @@ where // box fill chart.draw_series(std::iter::once(Rectangle::new( [(bl, q1), (br, q3)], - ShapeStyle { color: gray_fill, filled: true, stroke_width: ps(1.0) }, + ShapeStyle { + color: gray_fill, + filled: true, + stroke_width: ps(1.0), + }, )))?; // box border chart.draw_series(std::iter::once(Rectangle::new( @@ -623,7 +669,10 @@ where ))?; // whiskers + caps for &(from, to, cap_y) in &[(q1, wl, wl), (q3, wh, wh)] { - chart.draw_series(LineSeries::new(vec![(cx, from), (cx, to)], BLACK.stroke_width(ps(1.0))))?; + chart.draw_series(LineSeries::new( + vec![(cx, from), (cx, to)], + BLACK.stroke_width(ps(1.0)), + ))?; chart.draw_series(LineSeries::new( vec![(bl + cap, cap_y), (br - cap, cap_y)], BLACK.stroke_width(ps(1.0)), @@ -717,10 +766,18 @@ where 
.y_desc("Frequency") .x_label_formatter(&|v| { let r = (*v * 10.0).round() / 10.0; - if (r - r.round()).abs() < 0.01 { format_rpk_tick(r) } else { String::new() } + if (r - r.round()).abs() < 0.01 { + format_rpk_tick(r) + } else { + String::new() + } }) .y_label_formatter(&|v| { - if *v == v.floor() && *v >= 0.0 { format!("{}", *v as i32) } else { String::new() } + if *v == v.floor() && *v >= 0.0 { + format!("{}", *v as i32) + } else { + String::new() + } }) .axis_desc_style(("sans-serif", ps(14.0))) .label_style(("sans-serif", ps(12.0))) @@ -728,12 +785,18 @@ where let gray = RGBAColor(190, 190, 190, 1.0); for (i, &c) in hist.iter().enumerate() { - if c == 0 { continue; } + if c == 0 { + continue; + } let x0 = x_min + i as f64 * bw; let x1 = x0 + bw; chart.draw_series(std::iter::once(Rectangle::new( [(x0, 0.0), (x1, c as f64)], - ShapeStyle { color: gray, filled: true, stroke_width: 0 }, + ShapeStyle { + color: gray, + filled: true, + stroke_width: 0, + }, )))?; chart.draw_series(std::iter::once(Rectangle::new( [(x0, 0.0), (x1, c as f64)], @@ -795,7 +858,10 @@ pub fn write_mqc_intercept( writeln!(f, "# dupRadar_intercept:")?; writeln!(f, "# title: 'dupRadar int'")?; writeln!(f, "# namespace: 'dupRadar'")?; - writeln!(f, "# description: 'dupRadar duplication rate at low read counts'")?; + writeln!( + f, + "# description: 'dupRadar duplication rate at low read counts'" + )?; writeln!(f, "# max: 100")?; writeln!(f, "# min: 0")?; writeln!(f, "# format: '{{:.2f}}'")?; @@ -805,11 +871,7 @@ pub fn write_mqc_intercept( } /// Write a MultiQC-compatible line-graph curve file. 
-pub fn write_mqc_curve( - fit: &FitResult, - dm: &DupMatrix, - path: &std::path::Path, -) -> Result<()> { +pub fn write_mqc_curve(fit: &FitResult, dm: &DupMatrix, path: &std::path::Path) -> Result<()> { use std::io::Write; let mut f = std::fs::File::create(path)?; writeln!(f, "# id: 'dupradar'")?; @@ -824,7 +886,12 @@ pub fn write_mqc_curve( writeln!(f, "# ymax: 100")?; writeln!(f, "# xlog: True")?; - let rpks: Vec<f64> = dm.rows.iter().filter(|r| r.rpk > 0.0).map(|r| r.rpk).collect(); + let rpks: Vec<f64> = dm + .rows + .iter() + .filter(|r| r.rpk > 0.0) + .map(|r| r.rpk) + .collect(); if rpks.is_empty() { return Ok(()); } diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 3223db5..f4e9255 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -248,19 +248,11 @@ fn test_all_output_files_generated() { for file in &expected_files { let path = format!("{}/{}", outdir, file); - assert!( - Path::new(&path).exists(), - "Missing output file: {}", - file - ); + assert!(Path::new(&path).exists(), "Missing output file: {}", file); // Check file is non-empty let metadata = fs::metadata(&path).unwrap(); - assert!( - metadata.len() > 0, - "Output file is empty: {}", - file - ); + assert!(metadata.len() > 0, "Output file is empty: {}", file); } // Cleanup @@ -328,10 +320,7 @@ fn test_mqc_intercept_format() { // Check MultiQC intercept file format let content = fs::read_to_string(format!("{}/test_dup_intercept_mqc.txt", outdir)).unwrap(); // Skip YAML comment lines (starting with #) - let data_lines: Vec<&str> = content - .lines() - .filter(|l| !l.starts_with('#')) - .collect(); + let data_lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); assert!(data_lines.len() >= 2, "MultiQC intercept file too short"); // First data line should be a header with "Sample" and "dupRadar_intercept" @@ -369,10 +358,7 @@ fn test_mqc_curve_format() { let content = fs::read_to_string(format!("{}/test_duprateExpDensCurve_mqc.txt", 
outdir)).unwrap(); // Skip YAML comment lines (starting with #) - let data_lines: Vec<&str> = content - .lines() - .filter(|l| !l.starts_with('#')) - .collect(); + let data_lines: Vec<&str> = content.lines().filter(|l| !l.starts_with('#')).collect(); // Header + at least some data points (101 evenly spaced + header = 102) assert!( data_lines.len() >= 3, @@ -382,11 +368,7 @@ fn test_mqc_curve_format() { // Should have a header line with 2 columns let header_parts: Vec<&str> = data_lines[0].split('\t').collect(); - assert_eq!( - header_parts.len(), - 2, - "Curve header should have 2 columns" - ); + assert_eq!(header_parts.len(), 2, "Curve header should have 2 columns"); // Data lines should have numeric values for line in &data_lines[1..] { From 76337cd740059c6b82d956b38d58697323ca853c Mon Sep 17 00:00:00 2001 From: adamrtalbot <12817534+adamrtalbot@users.noreply.github.com> Date: Thu, 12 Feb 2026 19:51:01 +0000 Subject: [PATCH 3/6] Fix clippy lints for Rust 1.93 - is_multiple_of instead of manual modulo check - Iterator enumerate instead of needless_range_loop - allow too_many_arguments on private draw helper - range contains instead of manual bounds check - std::mem::take instead of drain().collect() Generated by Claude Code --- src/counting.rs | 2 +- src/plots.rs | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/counting.rs b/src/counting.rs index 8b0b440..697d745 100644 --- a/src/counting.rs +++ b/src/counting.rs @@ -350,7 +350,7 @@ pub fn count_reads( result.context("Error reading BAM record")?; total_reads += 1; - if total_reads % 5_000_000 == 0 { + if total_reads.is_multiple_of(5_000_000) { debug!("Processed {} reads...", total_reads); } diff --git a/src/plots.rs b/src/plots.rs index d1473ef..6ad009e 100644 --- a/src/plots.rs +++ b/src/plots.rs @@ -118,12 +118,12 @@ fn estimate_density(x: &[f64], y: &[f64], nbins: usize) -> Vec { // Anisotropic Gaussian smoothing with adaptive bandwidth let mut smoothed = vec![vec![0.0f64; 
nbins + 1]; nbins + 1]; - for bx in 0..=nbins { - for by in 0..=nbins { - if grid[bx][by] == 0 { + for (bx, row) in grid.iter().enumerate() { + for (by, &cell) in row.iter().enumerate() { + if cell == 0 { continue; } - let c = grid[bx][by] as f64; + let c = cell as f64; for dx in -radius_x..=radius_x { for dy in -radius_y..=radius_y { let nx = bx as i32 + dx; @@ -197,6 +197,7 @@ fn quantile(sorted: &[f64], p: f64) -> f64 { /// /// plotters has no native dashed-line support, so we draw small segments /// separated by gaps. +#[allow(clippy::too_many_arguments)] fn draw_dotted_vline( root: &DrawingArea, x: i32, @@ -285,7 +286,7 @@ where .y_labels(21) // step=5 on 0-100, then filter to multiples of 25 .y_label_formatter(&|v| { let iv = v.round() as i32; - if iv >= 0 && iv <= 100 && iv % 25 == 0 && (*v - iv as f64).abs() < 0.1 { + if (0..=100).contains(&iv) && iv % 25 == 0 && (*v - iv as f64).abs() < 0.1 { format!("{}", iv) } else { String::new() @@ -336,7 +337,7 @@ where if seg_idx >= limit { if seg_on && seg_pts.len() >= 2 { chart.draw_series(LineSeries::new( - seg_pts.drain(..).collect::<Vec<_>>(), + std::mem::take(&mut seg_pts), BLACK.stroke_width(curve_sw), ))?; } From b9f2d5e7dae9d3af7a2a916b3f9e95d7a123d5ef Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Thu, 12 Feb 2026 23:26:09 +0100 Subject: [PATCH 4/6] Run cargo fmt --- src/counting.rs | 66 +++++++++++++++++++++++++++++-------------------- src/plots.rs | 21 +++++----------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/src/counting.rs b/src/counting.rs index 1ed93e4..ae5c489 100644 --- a/src/counting.rs +++ b/src/counting.rs @@ -554,39 +554,46 @@ pub fn count_reads( // // If only one mate has gene hits (the other overlaps nothing), // we use that mate's gene set directly. 
- let combined_genes: Vec<String> = if mate_info.gene_hits.is_empty() && gene_hits.is_empty() { - Vec::new() - } else if mate_info.gene_hits.is_empty() { - gene_hits - } else if gene_hits.is_empty() { - mate_info.gene_hits - } else { - // Both mates have gene hits - use INTERSECTION - let set_a: std::collections::HashSet<&String> = mate_info.gene_hits.iter().collect(); - let intersection: Vec<String> = gene_hits - .iter() - .filter(|g| set_a.contains(g)) - .cloned() - .collect(); - if intersection.is_empty() { - // Mates disagree on gene assignment - treat as ambiguous - // by returning the union (which will have len > 1) - let mut union = mate_info.gene_hits; - union.extend(gene_hits); - union.sort_unstable(); - union.dedup(); - union + let combined_genes: Vec<String> = + if mate_info.gene_hits.is_empty() && gene_hits.is_empty() { + Vec::new() + } else if mate_info.gene_hits.is_empty() { + gene_hits + } else if gene_hits.is_empty() { + mate_info.gene_hits } else { - intersection - } - }; + // Both mates have gene hits - use INTERSECTION + let set_a: std::collections::HashSet<&String> = + mate_info.gene_hits.iter().collect(); + let intersection: Vec<String> = gene_hits + .iter() + .filter(|g| set_a.contains(g)) + .cloned() + .collect(); + if intersection.is_empty() { + // Mates disagree on gene assignment - treat as ambiguous + // by returning the union (which will have len > 1) + let mut union = mate_info.gene_hits; + union.extend(gene_hits); + union.sort_unstable(); + union.dedup(); + union + } else { + intersection + } + }; // Assign to gene if unambiguous (exactly one gene from combined overlaps) if combined_genes.is_empty() { stat_no_features += 1; } else if combined_genes.len() > 1 { stat_ambiguous += 1; - } else if assign_fragment_to_gene(&combined_genes, &mut gene_counts, frag_is_dup, frag_is_multi) { + } else if assign_fragment_to_gene( + &combined_genes, + &mut gene_counts, + frag_is_dup, + frag_is_multi, + ) { stat_assigned += 1; } } else { @@ -620,7 +627,12 @@ pub fn count_reads( 
stat_no_features += 1; } else if mate_info.gene_hits.len() > 1 { stat_ambiguous += 1; - } else if assign_fragment_to_gene(&mate_info.gene_hits, &mut gene_counts, mate_info.is_dup, mate_info.is_multi) { + } else if assign_fragment_to_gene( + &mate_info.gene_hits, + &mut gene_counts, + mate_info.is_dup, + mate_info.is_multi, + ) { stat_assigned += 1; } } diff --git a/src/plots.rs b/src/plots.rs index 4386066..5d36104 100644 --- a/src/plots.rs +++ b/src/plots.rs @@ -121,7 +121,7 @@ fn estimate_density(x: &[f64], y: &[f64], nbins: usize) -> Vec { let iy = fy.floor() as i32; let sx = fx - ix as f64; // fractional x let sy = fy - iy as f64; // fractional y - // Distribute weight to 4 corners + // Distribute weight to 4 corners for (dx, wx) in [(0i32, 1.0 - sx), (1, sx)] { for (dy, wy) in [(0i32, 1.0 - sy), (1, sy)] { let gx = ix + dx; @@ -145,14 +145,9 @@ fn estimate_density(x: &[f64], y: &[f64], nbins: usize) -> Vec { for dy in -radius_y..=radius_y { let nx = bx as i32 + dx; let ny = by as i32 + dy; - if nx >= 0 - && (nx as usize) < grid_size - && ny >= 0 - && (ny as usize) < grid_size + if nx >= 0 && (nx as usize) < grid_size && ny >= 0 && (ny as usize) < grid_size { - let w = (-(dx * dx) as f64 / sigma2_x - - (dy * dy) as f64 / sigma2_y) - .exp(); + let w = (-(dx * dx) as f64 / sigma2_x - (dy * dy) as f64 / sigma2_y).exp(); smoothed[nx as usize][ny as usize] += c * w; } } @@ -333,15 +328,11 @@ where // ── points (data order, matching R's plot() behavior) ───────────── // R draws points in data order with pch=20 cex=0.25 → tiny filled dots. - // R's pch=20 with cex=0.25 draws tiny filled circles. + // R's pch=20 with cex=0.25 draws tiny filled circles. // Circle radius 1 at our scale gives the closest match. 
for i in 0..xd.len() { let c = density_color(densities[i]); - chart.draw_series(std::iter::once(Circle::new( - (xd[i], yd[i]), - 1, - c.filled(), - )))?; + chart.draw_series(std::iter::once(Circle::new((xd[i], yd[i]), 1, c.filled())))?; } // ── fit curve: R uses col='black', lwd=2, lty=3 (dotted) ────────── @@ -973,7 +964,7 @@ mod tests { assert_eq!((c0.0, c0.1, c0.2), (0, 255, 255)); // cyan let c1 = density_color(1.0); assert_eq!((c1.0, c1.1, c1.2), (255, 0, 0)); // red - // Mid-point should be green + // Mid-point should be green let c_mid = density_color(0.5); assert_eq!((c_mid.0, c_mid.1, c_mid.2), (0, 255, 0)); // green } From 8a2b0fe8d596be2f879e5678a806bc11eaeb4b3d Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Thu, 12 Feb 2026 23:31:08 +0100 Subject: [PATCH 5/6] Add AGENTS.md so that the auto-formatting and checks are done pre-push --- AGENTS.md | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..37c38d2 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,170 @@ +# AGENTS.md — dupRust + +> Fast Rust reimplementation of [dupRadar](https://github.com/ssayols/dupRadar) for assessing +> PCR duplicate rates in RNA-Seq datasets. Binary crate (`duprust`), Rust edition 2021. 
+ +## Build / Lint / Test Commands + +```bash +# Build +cargo build # debug build +cargo build --release # optimized release build (LTO, strip, opt-level 3) + +# Format (enforced in CI — default rustfmt, no config file) +cargo fmt # auto-format +cargo fmt --check # check only (CI uses this) + +# Lint (enforced in CI — default clippy, warnings are errors) +cargo clippy -- -D warnings + +# Test — all unit + integration tests +cargo test # debug mode +cargo test --release # release mode (CI uses this) + +# Run a single test by name (substring match) +cargo test test_dup_rate_calculation +cargo test test_dup_matrix_exact_match -- --nocapture # with stdout + +# Run only unit tests (skip integration tests) +cargo test --lib + +# Run only integration tests +cargo test --test integration_test + +# Run a specific integration test +cargo test --test integration_test test_intercept_slope_match +``` + +## Project Structure + +``` +src/ + main.rs — Entry point, orchestrates the 8-step pipeline + cli.rs — CLI argument parsing (clap derive) + config.rs — YAML configuration loading (serde) + gtf.rs — GTF annotation file parser + counting.rs — BAM read counting engine (largest module) + dupmatrix.rs — Duplication matrix construction & TSV output + fitting.rs — Logistic regression via IRLS + plots.rs — Plot generation: density scatter, boxplot, histogram +tests/ + integration_test.rs — 8 integration tests vs R dupRadar reference output + data/ — Test BAM/GTF input files + expected/ — R-generated reference outputs + create_test_data.R — R script to regenerate test data + references +``` + +Flat module structure — all modules declared in `main.rs`, no `lib.rs`. +Inter-module access uses `crate::` paths (e.g., `use crate::gtf::Gene;`). + +## Code Style + +### Formatting + +- **Default `rustfmt`** — no `rustfmt.toml` exists. Do not create one. +- 4-space indentation, ~100 char line width. +- Trailing commas on all multi-line constructs. 
+- Chained method calls break to new line with indent. + +### Imports + +Three groups (crate-internal, third-party, std), though blank-line separation +between groups is not strictly enforced. Each `use` is a single statement: + +```rust +use crate::gtf::Gene; +use anyhow::{Context, Result}; +use indexmap::IndexMap; +use log::{debug, info}; +use std::collections::HashMap; +``` + +Localized `use` inside function bodies is acceptable for narrow imports +(e.g., `use std::io::Write;`). + +### Naming + +| Kind | Convention | Examples | +|-------------------|------------------------|-------------------------------------------------| +| Types / Structs | `CamelCase` | `GeneCounts`, `DupMatrix`, `FitResult` | +| Functions/Methods | `snake_case` | `count_reads`, `build_index`, `format_float` | +| Constants | `SCREAMING_SNAKE_CASE` | `BAM_FDUP`, `DENSITY_COLORS`, `SCALE` | +| Modules | `snake_case` | `dupmatrix`, `counting`, `fitting` | +| Variables/Fields | `snake_case` | `gene_counts`, `dup_rate_multi`, `is_dup` | +| Type aliases | `CamelCase` | `MateBufferKey` | + +### Error Handling + +- **`anyhow::Result`** for all fallible functions. No custom error types. +- Propagate with `?` operator. +- Add context with `.context("msg")` or `.with_context(|| format!(...))`. +- Use `anyhow::bail!()` for early error returns. +- Use `anyhow::ensure!()` for precondition checks. +- **`unwrap()` / `expect()`** belong in test code. In production code, avoid `expect()`; + `unwrap()` is acceptable only when a prior guard makes it provably safe (add a comment). + +### Documentation + +- **Every source file** starts with `//!` module doc comment (2-4 lines). +- **All public items** (structs, fields, functions, methods) get `///` doc comments. +- Complex functions include `# Arguments` and `# Returns` sections. +- Inline `//` comments explain complex logic, domain-specific behavior, and + references to R equivalents. 
+- Long files use section dividers: `// ===` for major sections, `// ---` for sub-sections. + +### Types and Derives + +- `#[derive(Debug)]` on all structs. +- Add `Clone`, `Default`, `Deserialize` as needed — keep derives minimal. +- Public structs expose `pub` fields. Private helper structs keep fields private. +- Numeric conventions: `u64` for counts/positions, `f64` for metrics, `u8` for flags/strandedness. +- `IndexMap` when insertion order matters (gene ordering); `HashMap` for unordered lookups. + +### Clippy + +- Default clippy settings with `-D warnings` (deny all warnings). +- Targeted `#[allow(...)]` annotations are acceptable with justification: + - `#[allow(dead_code)]` for fields kept for API completeness. + - `#[allow(clippy::too_many_arguments)]` when refactoring would reduce clarity. + +### Tests + +- **Unit tests** co-located in each source file inside `#[cfg(test)] mod tests { use super::*; ... }`. +- **Integration tests** in `tests/integration_test.rs` — run the binary as a subprocess + and compare output against R reference files in `tests/expected/`. +- Test naming: `test_<description>` (e.g., `test_dup_rate_calculation`). +- Use `assert_eq!` with descriptive messages for exact comparisons. +- Use `assert!((val - expected).abs() < tolerance)` for float comparisons. +- No dev-dependencies — tests use only std + crate dependencies. + +## CI Pipeline + +GitHub Actions (`.github/workflows/ci.yml`) runs on push to `main` and all PRs: + +1. **Test** — `cargo test --release` on Ubuntu and macOS +2. **Format** — `cargo fmt --check` +3. **Clippy** — `cargo clippy -- -D warnings` + +All three must pass. Uses `dtolnay/rust-toolchain@stable` and `Swatinem/rust-cache@v2`. 
+ +## Key Dependencies + +| Crate | Purpose | +|----------------|----------------------------------| +| `clap` v4 | CLI argument parsing (derive) | +| `rust-htslib` | BAM file I/O (statically linked) | +| `plotters` | Chart generation (PNG + SVG) | +| `serde` | YAML config deserialization | +| `anyhow` | Error handling | +| `log` | Logging facade | +| `env_logger` | Log output backend | +| `indexmap` | Insertion-order-preserving maps | + +## Notes for Agents + +- The codebase is a pure binary crate with no library target. +- Release builds use aggressive optimization (`lto = true`, `codegen-units = 1`, `strip = true`). +- Test data is generated by `tests/create_test_data.R` — do not modify `tests/expected/` by hand. +- Float output formatting must match R's behavior (15 significant digits, "NA" for NaN, trailing-zero trimming). +- The pipeline processes BAM files which can be very large — performance matters. +- System dependencies needed for building: cmake, zlib, bz2, lzma, curl, ssl, clang (for `rust-htslib`). From c409602d6e15072f675fcd69426e0c8ee0387764 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Thu, 12 Feb 2026 23:33:09 +0100 Subject: [PATCH 6/6] Fix clippy errors --- src/counting.rs | 1 + src/plots.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/src/counting.rs b/src/counting.rs index ae5c489..c640a05 100644 --- a/src/counting.rs +++ b/src/counting.rs @@ -23,6 +23,7 @@ const BAM_FUNMAP: u16 = 0x4; /// Flag indicating the read failed quality checks (0x200). const BAM_FQCFAIL: u16 = 0x200; /// Flag indicating a secondary alignment (0x100). +#[allow(dead_code)] const BAM_FSECONDARY: u16 = 0x100; /// Flag indicating a supplementary alignment (0x800). 
const BAM_FSUPPLEMENTARY: u16 = 0x800; diff --git a/src/plots.rs b/src/plots.rs index 5d36104..aa79948 100644 --- a/src/plots.rs +++ b/src/plots.rs @@ -135,6 +135,7 @@ fn estimate_density(x: &[f64], y: &[f64], nbins: usize) -> Vec { // Anisotropic Gaussian smoothing (matching bkde2D with tau=3.4) let mut smoothed = vec![vec![0.0f64; grid_size]; grid_size]; + #[allow(clippy::needless_range_loop)] // bx/by used as integer coordinates for offset arithmetic for bx in 0..grid_size { for by in 0..grid_size { if grid[bx][by] == 0.0 {