Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow backfilling of counts and rpkm without triggering preprocessing #771

Open
wants to merge 8 commits into
base: development
Choose a base branch
from
20 changes: 16 additions & 4 deletions gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.text.MessageFormat;
import java.util.Collection;
Expand All @@ -53,12 +52,13 @@ public class RNASeqDataAddCli extends ExpressionExperimentManipulatingCLI {
private static final String MULTIQC_METADATA_FILE_OPT = "multiqc";
private boolean allowMissingSamples = false;
private String countFile = null;
private boolean isPairedReads = false;
private Boolean isPairedReads = null;
private String platformName = null;
private Integer readLength = null;
private String rpkmFile = null;
private boolean justbackfillLog2cpm = false;
private File qualityControlReportFile = null;
private boolean justbackfillCountsAndRPKM = false;

@Override
public CommandGroup getCommandGroup() {
Expand All @@ -75,12 +75,14 @@ protected void buildOptions( Options options ) {
options.addOption( RNASeqDataAddCli.ALLOW_MISSING, "Set this if your data files don't have information for all samples." );
options.addOption( Option.builder( "a" ).longOpt( null ).desc( "Target platform (must already exist in the system)" ).argName( "platform short name" ).hasArg().build() );

options.addOption( Option.builder( RNASeqDataAddCli.METADATAOPT ).longOpt( null ).desc( "Information on read length given as a string like '100:paired', '36 (assumed unpaired)', or '36:unpaired' " ).argName( "length" ).hasArg().build() );
options.addOption( Option.builder( RNASeqDataAddCli.METADATAOPT ).longOpt( null ).desc( "Information on read length given as a string like '100:paired', '36' (no pairedness assumed), or '36:unpaired' " ).argName( "length" ).hasArg().build() );

options.addOption( "log2cpm", "Just compute log2cpm from the existing stored count data (backfill); batchmode OK, no other options needed" );

// RNA-Seq pipeline QC report
options.addOption( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT, true, "File containing a MultiQC report" );

options.addOption( Option.builder( "noLog2cpm" ).longOpt( null ).desc( "Skip log2cpm computation and postprocessing, only backfill counts and rpkm" ).build() );
}

@Override
Expand Down Expand Up @@ -124,6 +126,8 @@ protected void processOptions( CommandLine commandLine ) {
} else {
throw new IllegalArgumentException( "Value must be either 'paired' or 'unpaired' or left blank" );
}
} else {
this.isPairedReads = null;
}

}
Expand All @@ -133,12 +137,20 @@ protected void processOptions( CommandLine commandLine ) {
if ( rpkmFile == null && countFile == null )
throw new IllegalArgumentException( "Must provide either RPKM or count data (or both)" );


if ( !commandLine.hasOption( "a" ) ) {
throw new IllegalArgumentException( "Must provide target platform" );
}

this.platformName = commandLine.getOptionValue( "a" );

if ( commandLine.hasOption( "noLog2cpm" ) ) {
this.justbackfillCountsAndRPKM = true;
if ( this.justbackfillLog2cpm ) {
throw new IllegalArgumentException( "Can't use both noLog2cpm and log2cpm options" );
}
}

if ( commandLine.hasOption( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT ) ) {
qualityControlReportFile = new File( commandLine.getOptionValue( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT ) );
if ( !qualityControlReportFile.isFile() || !qualityControlReportFile.canRead() ) {
Expand Down Expand Up @@ -208,7 +220,7 @@ protected void doWork() throws Exception {
}

serv.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, readLength, isPairedReads,
allowMissingSamples );
allowMissingSamples, justbackfillCountsAndRPKM );

} catch ( IOException e ) {
throw new Exception( "Failed while processing " + ee, e );
Expand Down
112 changes: 112 additions & 0 deletions gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package ubic.gemma.core.apps;

import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.security.test.context.support.WithMockUser;
import org.springframework.test.annotation.DirtiesContext;
import org.springframework.test.context.ContextConfiguration;
import ubic.gemma.core.analysis.service.ExpressionDataFileService;
import ubic.gemma.core.genome.gene.service.GeneService;
import ubic.gemma.core.loader.expression.DataUpdater;
import ubic.gemma.core.search.SearchService;
import ubic.gemma.core.util.AbstractCLI;
import ubic.gemma.core.util.test.BaseCliTest;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
import ubic.gemma.persistence.service.genome.taxon.TaxonService;
import ubic.gemma.persistence.util.TestComponent;

import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.*;

@DirtiesContext(classMode = DirtiesContext.ClassMode.AFTER_EACH_TEST_METHOD)
@ContextConfiguration
public class RNASeqDataAddCliTest extends BaseCliTest {

@TestComponent
@Configuration
static class RNASeqDataAddCliTestContextConfiguration extends BaseCliTestContextConfiguration {

@Bean
public RNASeqDataAddCli rnaSeqDataAddCli() {
return new RNASeqDataAddCli();
}

@Bean
public DataUpdater dataUpdater() {
return mock( DataUpdater.class );
}

@Bean
public GeneService geneService() {
return mock( GeneService.class );
}

@Bean
public TaxonService taxonService() {
return mock( TaxonService.class );
}

@Bean
public SearchService searchService() {
return mock( SearchService.class );
}

@Bean
public ExpressionDataFileService expressionDataFileService() {
return mock( ExpressionDataFileService.class );
}

@Bean
public ArrayDesignService arrayDesignService() {
return mock( ArrayDesignService.class );
}
}

@Autowired
private RNASeqDataAddCli cli;

@Autowired
private DataUpdater dataUpdater;

@Autowired
private ExpressionExperimentService expressionExperimentService;

@Autowired
private ArrayDesignService arrayDesignService;

@Test
@WithMockUser
public void test() {
ExpressionExperiment ee = new ExpressionExperiment();
ArrayDesign ad = new ArrayDesign();
when( expressionExperimentService.findByShortName( "GSE00001" ) ).thenReturn( ee );
when( arrayDesignService.findByShortName( "Generic_human_ncbiIds" ) ).thenReturn( ad );
assertEquals( AbstractCLI.SUCCESS, cli.executeCommand( new String[] {
"-e", "GSE00001", "-a", "Generic_human_ncbiIds",
"-rpkm", getClass().getResource( "/GSE00001.rpkm.txt" ).getFile(),
"-count", getClass().getResource( "/GSE00001.counts.txt" ).getFile() } ) );
verify( dataUpdater ).addCountData( same( ee ), same( ad ), isNotNull(), isNotNull(), isNull(),
isNull(), eq( false ), eq( false ) );
}

@Test
@WithMockUser
public void testSkipLog2cpm() {
ExpressionExperiment ee = new ExpressionExperiment();
ArrayDesign ad = new ArrayDesign();
when( expressionExperimentService.findByShortName( "GSE00001" ) ).thenReturn( ee );
when( arrayDesignService.findByShortName( "Generic_human_ncbiIds" ) ).thenReturn( ad );
assertEquals( AbstractCLI.SUCCESS, cli.executeCommand( new String[] {
"-e", "GSE00001", "-a", "Generic_human_ncbiIds",
"-rpkm", getClass().getResource( "/GSE00001.rpkm.txt" ).getFile(),
"-count", getClass().getResource( "/GSE00001.counts.txt" ).getFile(),
"-noLog2cpm" } ) );
verify( dataUpdater ).addCountData( same( ee ), same( ad ), isNotNull(), isNotNull(), isNull(),
isNull(), eq( false ), eq( true ) );
}
}
2 changes: 2 additions & 0 deletions gemma-cli/src/test/resources/GSE00001.counts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GSM000001
1
2 changes: 2 additions & 0 deletions gemma-cli/src/test/resources/GSE00001.rpkm.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GSM000001
0.5
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,25 @@
public interface DataUpdater {
void addAffyDataFromAPTOutput( ExpressionExperiment ee, String pathToAptOutputFile ) throws IOException;

/**
* RNA-seq: Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation
* type we use internally. Counts and FPKM (if provided) are stored in addition.
*
* Rows (genes) that have all zero counts are ignored entirely.
*
* @param ee ee
* @param targetArrayDesign - this should be one of the "Generic" gene-based platforms. The data set will be
* switched to use it.
* @param countMatrix Representing 'raw' counts (added after rpkm, if provided).
* @param rpkmMatrix Representing per-gene normalized data, optional (RPKM or FPKM)
* @param readLength read length
* @param isPairedReads is paired reads
* @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
* @param skipLog2cpm Only load the counts (and RPKM/FPKM if provided) - implemented to allow backfilling without updating log2cpm and preproc.
*/
void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign,
DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength,
Boolean isPairedReads, boolean allowMissingSamples );
Boolean isPairedReads, boolean allowMissingSamples, boolean skipLog2cpm );

void log2cpmFromCounts( ExpressionExperiment ee, QuantitationType qt );

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,26 +178,11 @@ public void addAffyDataFromAPTOutput( ExpressionExperiment ee, String pathToAptO

}

/**
* RNA-seq: Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation
* type we use internally. Counts and FPKM (if provided) are stored in addition.
*
* Rows (genes) that have all zero counts are ignored entirely.
*
* @param ee ee
* @param targetArrayDesign - this should be one of the "Generic" gene-based platforms. The data set will be
* switched to use it.
* @param countMatrix Representing 'raw' counts (added after rpkm, if provided).
* @param rpkmMatrix Representing per-gene normalized data, optional (RPKM or FPKM)
* @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
* @param isPairedReads is paired reads
* @param readLength read length
*/
@Override
@Transactional(propagation = Propagation.NEVER)
public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign,
DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength,
Boolean isPairedReads, boolean allowMissingSamples ) {
Boolean isPairedReads, boolean allowMissingSamples, boolean skipLog2cpm ) {

if ( countMatrix == null )
throw new IllegalArgumentException( "You must provide count matrix (rpkm is optional)" );
Expand All @@ -215,6 +200,7 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
}

ee = experimentService.thawLite( ee );
Collection<QuantitationType> oldQts = ee.getQuantitationTypes();

ee = this.dealWithMissingSamples( ee, countMatrix, allowMissingSamples );

Expand All @@ -225,27 +211,8 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
assert !properCountMatrix.getColNames().isEmpty();
assert !properCountMatrix.getRowNames().isEmpty();

Collection<QuantitationType> oldQts = ee.getQuantitationTypes();

// countEEMatrix = this.removeNoDataRows( countEEMatrix );

QuantitationType log2cpmQt = this.makelog2cpmQt();
for ( QuantitationType oldqt : oldQts ) { // use old QT if possible
if ( oldqt.getName().equals( log2cpmQt.getName() ) ) {
log2cpmQt = oldqt;
break;
}
}

DoubleMatrix1D librarySize = MatrixStats.colSums( countMatrix );
DoubleMatrix<CompositeSequence, BioMaterial> log2cpmMatrix = MatrixStats
.convertToLog2Cpm( properCountMatrix, librarySize );

ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix( ee, log2cpmQt, log2cpmMatrix );

// important: replaceData takes care of the platform switch if necessary; call first. It also deletes old QTs, so from here we have to remake them.
ee = this.replaceData( ee, targetArrayDesign, log2cpmEEMatrix );

QuantitationType countqt = this.makeCountQt();
for ( QuantitationType oldqt : oldQts ) { // use old QT if possible
if ( oldqt.getName().equals( countqt.getName() ) ) {
Expand All @@ -254,11 +221,11 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
}
}
ExpressionDataDoubleMatrix countEEMatrix = new ExpressionDataDoubleMatrix( ee, countqt, properCountMatrix );

ee = this.addData( ee, targetArrayDesign, countEEMatrix );
this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads, skipLog2cpm );

this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads );

/* add rpkm/fpkm data if available */
if ( rpkmMatrix != null ) {

DoubleMatrix<CompositeSequence, BioMaterial> properRPKMMatrix = this
Expand All @@ -281,6 +248,31 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
this.addData( ee, targetArrayDesign, rpkmEEMatrix );
}

// offered only to allow us to backfill the count and rpkm data without updating log2cpm and triggering postprocessing
if ( skipLog2cpm ) {
log.info( "Done adding count data, skipping the rest as per selected options" );
return;
}

/* Add the log2cpm data.*/
QuantitationType log2cpmQt = this.makelog2cpmQt();
for ( QuantitationType oldqt : oldQts ) { // use old QT if possible
if ( oldqt.getName().equals( log2cpmQt.getName() ) ) {
log2cpmQt = oldqt;
break;
}
}

DoubleMatrix1D librarySize = MatrixStats.colSums( countMatrix );
DoubleMatrix<CompositeSequence, BioMaterial> log2cpmMatrix = MatrixStats
.convertToLog2Cpm( properCountMatrix, librarySize );

ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix( ee, log2cpmQt, log2cpmMatrix );

// important: replaceData takes care of the platform switch if necessary; call first. It also deletes old QTs, so from here we have to remake them.
ee = this.replaceData( ee, targetArrayDesign, log2cpmEEMatrix );


}

/**
Expand Down Expand Up @@ -677,12 +669,15 @@ public ExpressionExperiment replaceData( ExpressionExperiment ee, ArrayDesign ta
* RNA-seq
*
* @param ee experiment
* @param countEEMatrix count ee matrix
* @param readLength read length
* @param isPairedReads is paired reads
* @param countEEMatrix count ee matrix; used to compute library sizes
* @param readLength read length (optional)
* @param isPairedReads is paired reads (optional)
* @param requireExistingLibrarySizesMatch whether to care if existing information about librarysizes can conflict with the one stored.
* This should be true when simply backfilling counts, as a check.
* It should be false if we are actually trying to replace existing counts.
*/
private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDoubleMatrix countEEMatrix,
Integer readLength, Boolean isPairedReads ) {
Integer readLength, Boolean isPairedReads, boolean requireExistingLibrarySizesMatch ) {
for ( BioAssay ba : ee.getBioAssays() ) {
Double[] col = countEEMatrix.getColumn( ba );
int librarySize = ( int ) Math.floor( DescriptiveWithMissing.sum( new DoubleArrayList( ArrayUtils.toPrimitive( col ) ) ) );
Expand All @@ -693,8 +688,17 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo
}
DataUpdaterImpl.log.info( ba + " total library size=" + librarySize );

ba.setSequenceReadLength( readLength );
ba.setSequencePairedReads( isPairedReads );
if ( readLength != null ) {
ba.setSequenceReadLength( readLength );
}

if ( isPairedReads != null ) {
ba.setSequencePairedReads( isPairedReads );
}

if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize && requireExistingLibrarySizesMatch ) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a test case for this specific condition.

throw new IllegalStateException( "Read count was already set for " + ba + " and is not the same (existing=" + ba.getSequenceReadCount() + " vs updated=" + librarySize + ")" );
}
ba.setSequenceReadCount( librarySize );

bioAssayService.update( ba );
Expand Down
Loading