nf-core · CharlotteAnne · Mar 20, 2025 · Mar 21, 2025 · Mar 21, 2025 · Mar 21, 2025
diff --git a/assets/merged_library_deseq2_clustering_header.txt b/assets/merged_library_deseq2_clustering_header.txt
@@ -0,0 +1,12 @@
+#id: 'mlib_deseq2_clustering'
+#section_name: 'MERGED LIB: DESeq2 sample similarity'
+#description: "Matrix is generated from clustering with Euclidean distances between
+#              <a href='https://bioconductor.org/packages/release/bioc/html/DESeq2.html' target='_blank'>DESeq2</a>
+#              rlog values for each sample
+#              in the <a href='https://github.com/nf-core/atacseq/blob/master/bin/deseq2_qc.r'><code>deseq2_qc.r</code></a> script."
+#plot_type: 'heatmap'
+#anchor: 'mlib_deseq2_clustering'
+#pconfig:
+#    title: 'DESeq2: Heatmap of the sample-to-sample distances'
+#    xlab: True
+#    reverseColors: True
diff --git a/assets/merged_library_deseq2_pca_header.txt b/assets/merged_library_deseq2_pca_header.txt
@@ -0,0 +1,11 @@
+#id: 'mlib_deseq2_pca'
+#section_name: 'MERGED LIB: DESeq2 PCA plot'
+#description: "PCA plot of the samples in the experiment.
+#              These values are calculated using <a href='https://bioconductor.org/packages/release/bioc/html/DESeq2.html'>DESeq2</a>
+#              in the <a href='https://github.com/nf-core/atacseq/blob/master/bin/deseq2_qc.r'><code>deseq2_qc.r</code></a> script."
+#plot_type: 'scatter'
+#anchor: 'mlib_deseq2_pca'
+#pconfig:
+#    title: 'DESeq2: Principal component plot'
+#    xlab: PC1
+#    ylab: PC2
diff --git a/assets/merged_replicate_deseq2_clustering_header.txt b/assets/merged_replicate_deseq2_clustering_header.txt
@@ -0,0 +1,12 @@
+#id: 'mrep_deseq2_clustering'
+#section_name: 'MERGED REP: DESeq2 sample similarity'
+#description: "Matrix is generated from clustering with Euclidean distances between
+#              <a href='https://bioconductor.org/packages/release/bioc/html/DESeq2.html' target='_blank'>DESeq2</a>
+#              rlog values for each sample
+#              in the <a href='https://github.com/nf-core/atacseq/blob/master/bin/deseq2_qc.r'><code>deseq2_qc.r</code></a> script."
+#plot_type: 'heatmap'
+#anchor: 'mrep_deseq2_clustering'
+#pconfig:
+#    title: 'DESeq2: Heatmap of the sample-to-sample distances'
+#    xlab: True
+#    reverseColors: True
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -47,16 +47,193 @@ module_order:
         - "*.final.out"
   - custom_content
 
+# Custom-content sections: one entry per TSV metrics file matched by the `sp` search patterns below
+custom_data:
+  mapping:
+    parent_id: mapping
+    parent_name: "Mapping"
+    file_format: "tsv"
+    section_name: "Mapping"
+    description: "The mapping metrics for each experiment"
+    plot_type: "bargraph"
+  dedup_reads:
+    parent_id: dedup
+    parent_name: "Deduplication"
+    file_format: "tsv"
+    section_name: "Reads"
+    description: "The number of reads before and after PCR deduplication for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Count"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  dedup_ratio:
+    parent_id: dedup
+    parent_name: "Deduplication"
+    file_format: "tsv"
+    section_name: "Ratio"
+    description: "The PCR deduplication ratio for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Ratio"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  dedup_mean_umis:
+    parent_id: dedup
+    parent_name: "Deduplication"
+    file_format: "tsv"
+    section_name: "Mean UMIs"
+    description: "Mean number of unique UMIs per position for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Mean number"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  crosslinks_counts:
+    parent_id: crosslinks
+    parent_name: "Crosslinks"
+    file_format: "tsv"
+    section_name: "Counts"
+    description: "The number of crosslinks or crosslink sites for each experiment"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Count"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  crosslinks_ratio:
+    parent_id: crosslinks
+    parent_name: "Crosslinks"
+    file_format: "tsv"
+    section_name: "Ratios"
+    description: "The ratio of number of cDNA mapping to crosslink positions for each experiment"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Ratio"  # axis shows a ratio, not a count (same label as dedup_ratio)
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+  peaks_counts:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Counts"
+    description: "The total number of peaks called by each peak caller"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Number of peaks"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  xlinks_in_peaks:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Crosslinks positions in peaks"
+    description: "The total percentage of crosslinks within peaks for each peak caller"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Percentage of crosslinks"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+      tt_suffix: "%"
+  xlinksites_in_peaks:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Crosslink sites in peaks"  # distinct from the xlinks_in_peaks section title
+    description: "The total percentage of crosslink sites within peaks for each peak caller"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Percentage of crosslink sites"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+      tt_suffix: "%"
+  peaks_xlinksite_coverage:
+    parent_id: peaks
+    parent_name: "Peaks"
+    file_format: "tsv"
+    section_name: "Peak-crosslink coverage"
+    description: "The total percentage of nucleotides within peaks covered by a crosslink site"
+    plot_type: "bargraph"
+    pconfig:
+      ylab: "Percentage of nucleotides within peaks"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+      tt_decimals: 2
+      tt_suffix: "%"
+  summary_type:
+    parent_id: Summary
+    parent_name: "Summary"
+    file_format: "tsv"
+    section_name: "Percentage of cDNA premap"
+    description: "The total percentage of cDNA summary mapped"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Type"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+  summary_subtype:
+    parent_id: Summary
+    parent_name: "Summary"
+    file_format: "tsv"
+    section_name: "Percentage of cDNA premap subtypes"
+    description: "The total percentage of cDNA subtypes mapped"
+    #plot_type: 'bargraph'
+    pconfig:
+      ylab: "Type"
+      #stacking: False
+      cpswitch: False
+      tt_percentages: False
+
+sp:  # search patterns; each key must equal the custom_data id it feeds
+  mapping:
+    fn: "mapping.tsv"
+  dedup_reads:
+    fn: "dedup_reads.tsv"
+  dedup_ratio:
+    fn: "dedup_ratio.tsv"
+  dedup_mean_umis:
+    fn: "dedup_mean_umis.tsv"
+  crosslinks_counts:
+    fn: "xlinks_counts.tsv"
+  crosslinks_ratio:
+    fn: "xlinks_ratio.tsv"
+  peaks_counts:  # was `peaks`, which matched no custom_data id
+    fn: "total_peaks.tsv"
+  xlinks_in_peaks:
+    fn: "xlinks_in_peaks.tsv"
+  xlinksites_in_peaks:
+    fn: "xlinksites_in_peaks.tsv"
+  peaks_xlinksite_coverage:
+    fn: "peaks_xlinksite_coverage.tsv"
+  summary_type:
+    fn: "summary_type_metrics.tsv"
+  summary_subtype:
+    fn: "summary_subtype_metrics.tsv"
+
 custom_content:
   order:
+    - clipqc
     - software-versions-by-process
     - software-versions-unique
-
 # Customise the module search patterns to speed up execution time
-sp:
-  samtools/stats:
-    fn: "*.stats"
-  samtools/flagstat:
-    fn: "*.flagstat"
-  samtools/idxstats:
-    fn: "*.idxstats*"
+# sp:
+#   samtools/stats:
+#     fn: "*.stats"
+#   samtools/flagstat:
+#     fn: "*.flagstat"
+#   samtools/idxstats:
+#     fn: "*.idxstats*"
+#   clipqc:
+#     fn: "*.txt"
diff --git a/conf/modules.config b/conf/modules.config
@@ -34,6 +34,7 @@ process {
 ========================================================================================
 */
 
+
 if(params.run_genome_prep) {
     process {
         withName: '.*PREPARE_GENOME:GUNZIP_.*' {
@@ -71,7 +72,6 @@ if(params.run_genome_prep) {
                 path: { "${params.outdir}/00_genome" },
                 mode: "${params.publish_dir_mode}",
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-                enabled: params.save_reference
             ]
         }
 
@@ -185,6 +185,7 @@ if(params.run_genome_prep) {
     }
 }
 
+
 /*
 ========================================================================================
     PRE-PROCESSING
@@ -756,6 +757,7 @@ if(params.run_crosslinking) {
 if(params.run_peakcalling && params.consensus_peak){
     process {
         withName: 'NFCORE_CLIPSEQ:CLIPSEQ:.*CONSENSUS_PEAK_TABLE:CONSENSUS_MAP' {
+            ext.prefix = { "${meta.id}_consensus_sorted" }
             publishDir = [
                 path: { "${params.outdir}/05_peakcalling/consensus_peak_tables" },
                 mode: "${params.publish_dir_mode}",
@@ -1004,13 +1006,13 @@ if(params.run_reporting) {
             ]
         }
 
-        // withName: 'CLIPSEQ:CLIPSEQ_CLIPQC' {
-        //     publishDir = [
-        //         path: { "${params.outdir}/06_reports/clipqc" },
-        //         mode: "${params.publish_dir_mode}",
-        //         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        //     ]
-        // }
+        withName: 'NFCORE_CLIPSEQ:CLIPSEQ:CLIPQC' { // publish CLIPQC outputs, dropping versions.yml
+            publishDir = [
+                path: { "${params.outdir}/06_reports/clipqc" },
+                mode: "${params.publish_dir_mode}",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
 
         withName: 'NFCORE_CLIPSEQ:CLIPSEQ:MULTIQC' {
             ext.args   = params.multiqc_title ? "-v --title \"$params.multiqc_title\"" : '-v'

diff --git a/conf/test.config b/conf/test.config
@@ -15,13 +15,15 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
-    max_cpus   = 2
-    max_memory = '8.GB'
-    max_time   = '6.h'
+    max_cpus   = 8 // NOTE(review): raised from 2; no longer matches the "Limit resources" comment above — confirm intended
+    max_memory = '60.GB' // NOTE(review): raised from '8.GB'
+    max_time   = '24.h' // NOTE(review): raised from '6.h'
 
-    // Input data
-    input  = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv'
-    source = "fastq"
+    // Inputs for testing dataset with yeast genome
+    //input  = 'https://raw.githubusercontent.com/nf-core/clipseq/refs/heads/feat-2-0/tests/test_new_samplesheet_FASTQ.csv'
+
+    //input    = '../tests/test_new_samplesheet_FASTQ.csv'
+    //source   = "fastq"
 
     // Genome references
     fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV.fa.gz"
@@ -44,6 +46,20 @@ params {
     seg_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_seg_genicOthertrue.resolved.gtf"
     regions_resolved_gtf_genic = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/yeast_MitoV_filtered_regions_genicOthertrue.resolved.gtf"
 
+
+    // Input data for full human testing (replaces the yeast `input`/`source`, which are commented out earlier in this file)
+    input  = "./tests/test_new_samplesheet_FASTQ_human.csv"
+    fasta = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" // NOTE(review): absolute local path, not portable; also reassigns the yeast `fasta` set earlier — confirm before merge
+    gtf   = "/data1/morrisq/chhabrs1/variant_calling/genome/GATK_GRCh38/Homo_sapiens.GRCh38.109.gtf.gz" // NOTE(review): absolute local path, as above
+    ncrna_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz"
+    source = "fastq"
+
+    // Genome references from s3 bucket
+    //fasta         = 's3://nf-core-awsmegatests/clipseq/input_data/reference/GRCh38.primary_assembly.genome.fa.gz'
+    //gtf           = 's3://nf-core-awsmegatests/clipseq/input_data/reference/gencode.v37.primary_assembly.annotation.gtf.gz'
+
+
+
     // Logic
     debug                 = true
     save_reference        = true
@@ -54,6 +70,16 @@ params {
     save_align_intermed   = true
     skip_transcriptome    = true
 
+    // Inputs for deseq2_qc
+    skip_deseq2_qc        = false
+
+
     // Pipeline params
     umitools_bc_pattern = 'NNNNNNNNN'
+
+    // Don't call consensus
+    //consensus_peak        = false
+
+
+
 }
diff --git a/modules.json b/modules.json
@@ -70,6 +70,11 @@
                         "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
                         "installed_by": ["modules"]
                     },
+                    "deseq2/differential": {
+                        "branch": "master",
+                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
+                        "installed_by": ["modules"]
+                    },
                     "fastqc": {
                         "branch": "master",
                         "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",

diff --git a/modules/local/clipqc/main.nf b/modules/local/clipqc/main.nf
@@ -12,6 +12,10 @@ process CLIPQC {
     path("icount/*")
     path("paraclu/*")
     path("clippy/*")
+    path("pureclip/*")
+    path("summary_type/*")
+    path("summary_subtype/*")
+    path("summary_gene/*")
 
     output:
     path "*.tsv"         , emit: tsv