From 5205963612213971178e907be474b7390ba28139 Mon Sep 17 00:00:00 2001 From: Paul Pavlidis Date: Mon, 17 Jul 2023 11:06:12 -0700 Subject: [PATCH 1/8] allow backfilling of counts and rpkm without triggering preprocessing --- .../gemma/core/apps/RNASeqDataAddCli.java | 14 +++- .../core/loader/expression/DataUpdater.java | 18 +++- .../loader/expression/DataUpdaterImpl.java | 82 +++++++++---------- .../analysis/expression/diff/DiffExTest.java | 2 +- .../preprocess/MeanVarianceServiceTest.java | 5 +- .../loader/expression/DataUpdaterTest.java | 8 +- 6 files changed, 76 insertions(+), 53 deletions(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java index 88ccc44d23..8f8c3b2af1 100644 --- a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java @@ -34,7 +34,6 @@ import java.io.File; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.StandardCopyOption; import java.text.MessageFormat; import java.util.Collection; @@ -59,6 +58,7 @@ public class RNASeqDataAddCli extends ExpressionExperimentManipulatingCLI { private String rpkmFile = null; private boolean justbackfillLog2cpm = false; private File qualityControlReportFile = null; + private boolean justbackfillCountsAndRPKM = false; @Override public CommandGroup getCommandGroup() { @@ -81,6 +81,8 @@ protected void buildOptions( Options options ) { // RNA-Seq pipeline QC report options.addOption( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT, true, "File containing a MultiQC report" ); + + options.addOption( Option.builder( "noLog2cpm" ).longOpt( null ).desc( "Skip log2cpm computation and postprocessing, only backfill counts and rpkm" ).build() ); } @Override @@ -133,12 +135,20 @@ protected void processOptions( CommandLine commandLine ) { if ( rpkmFile == null && countFile == null ) throw new IllegalArgumentException( "Must provide either RPKM or count data (or both)" ); + if ( !commandLine.hasOption( "a" ) ) { throw new IllegalArgumentException( "Must provide target platform" ); } this.platformName = commandLine.getOptionValue( "a" ); + if (commandLine.hasOption( "noLog2cpm")) { + this.justbackfillCountsAndRPKM = true; + if (this.justbackfillLog2cpm) { + throw new IllegalArgumentException("Can't use both noLog2cpm and log2cpm options"); + } + } + if ( commandLine.hasOption( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT ) ) { qualityControlReportFile = new File( commandLine.getOptionValue( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT ) ); if ( !qualityControlReportFile.isFile() || !qualityControlReportFile.canRead() ) { @@ -208,7 +218,7 @@ protected void doWork() throws Exception { } serv.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, readLength, isPairedReads, - allowMissingSamples ); + allowMissingSamples, justbackfillCountsAndRPKM ); } catch ( IOException e ) { throw new Exception( "Failed while processing " + ee, e ); diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java index 8a760d58f4..752ab8955b 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java @@ -11,9 +11,25 @@ public interface DataUpdater { void addAffyDataFromAPTOutput( ExpressionExperiment ee, String pathToAptOutputFile ) throws IOException; + /** + * RNA-seq: Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation + * type we use internally. Counts and FPKM (if provided) are stored in addition. + * + * Rows (genes) that have all zero counts are ignored entirely. + * + * @param ee ee + * @param targetArrayDesign - this should be one of the "Generic" gene-based platforms. The data set will be + * switched to use it. + * @param countMatrix Representing 'raw' counts (added after rpkm, if provided). + * @param rpkmMatrix Representing per-gene normalized data, optional (RPKM or FPKM) + * @param readLength read length + * @param isPairedReads is paired reads + * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment. + * @param skipLog2cpm Only load the counts (and RPKM/FPKM if provided) - implemented to allow backfilling without updating log2cpm and preproc. + */ void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign, DoubleMatrix countMatrix, DoubleMatrix rpkmMatrix, Integer readLength, - Boolean isPairedReads, boolean allowMissingSamples ); + Boolean isPairedReads, boolean allowMissingSamples, boolean skipLog2cpm ); void log2cpmFromCounts( ExpressionExperiment ee, QuantitationType qt ); diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java index 36692411e4..24c06e332d 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java @@ -178,26 +178,11 @@ public void addAffyDataFromAPTOutput( ExpressionExperiment ee, String pathToAptO } - /** - * RNA-seq: Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation - * type we use internally. Counts and FPKM (if provided) are stored in addition. - * - * Rows (genes) that have all zero counts are ignored entirely. - * - * @param ee ee - * @param targetArrayDesign - this should be one of the "Generic" gene-based platforms. The data set will be - * switched to use it. - * @param countMatrix Representing 'raw' counts (added after rpkm, if provided). - * @param rpkmMatrix Representing per-gene normalized data, optional (RPKM or FPKM) - * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment. - * @param isPairedReads is paired reads - * @param readLength read length - */ @Override @Transactional(propagation = Propagation.NEVER) public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign, DoubleMatrix countMatrix, DoubleMatrix rpkmMatrix, Integer readLength, - Boolean isPairedReads, boolean allowMissingSamples ) { + Boolean isPairedReads, boolean allowMissingSamples, boolean skipLog2cpm ) { if ( countMatrix == null ) throw new IllegalArgumentException( "You must provide count matrix (rpkm is optional)" ); @@ -215,6 +200,7 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign } ee = experimentService.thawLite( ee ); + Collection oldQts = ee.getQuantitationTypes(); ee = this.dealWithMissingSamples( ee, countMatrix, allowMissingSamples ); @@ -225,27 +211,8 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign assert !properCountMatrix.getColNames().isEmpty(); assert !properCountMatrix.getRowNames().isEmpty(); - Collection oldQts = ee.getQuantitationTypes(); - // countEEMatrix = this.removeNoDataRows( countEEMatrix ); - QuantitationType log2cpmQt = this.makelog2cpmQt(); - for ( QuantitationType oldqt : oldQts ) { // use old QT if possible - if ( oldqt.getName().equals( log2cpmQt.getName() ) ) { - log2cpmQt = oldqt; - break; - } - } - - DoubleMatrix1D librarySize = MatrixStats.colSums( countMatrix ); - DoubleMatrix log2cpmMatrix = MatrixStats - .convertToLog2Cpm( properCountMatrix, librarySize ); - - ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix( ee, log2cpmQt, log2cpmMatrix ); - - // important: replaceData takes care of the platform switch if necessary; call first. It also deletes old QTs, so from here we have to remake them. - ee = this.replaceData( ee, targetArrayDesign, log2cpmEEMatrix ); - QuantitationType countqt = this.makeCountQt(); for ( QuantitationType oldqt : oldQts ) { // use old QT if possible if ( oldqt.getName().equals( countqt.getName() ) ) { @@ -254,11 +221,11 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign } } ExpressionDataDoubleMatrix countEEMatrix = new ExpressionDataDoubleMatrix( ee, countqt, properCountMatrix ); - ee = this.addData( ee, targetArrayDesign, countEEMatrix ); - this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads ); + + /* add rpkm/fpkm data if available */ if ( rpkmMatrix != null ) { DoubleMatrix properRPKMMatrix = this @@ -281,6 +248,31 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign this.addData( ee, targetArrayDesign, rpkmEEMatrix ); } + // offered only to allow us to backfill the count and rpkm data without updating log2cpm and triggering postprocessing + if ( skipLog2cpm ) { + log.info( "Done adding count data, skipping the rest as per selected options" ); + return; + } + + /* Add the log2cpm data.*/ + QuantitationType log2cpmQt = this.makelog2cpmQt(); + for ( QuantitationType oldqt : oldQts ) { // use old QT if possible + if ( oldqt.getName().equals( log2cpmQt.getName() ) ) { + log2cpmQt = oldqt; + break; + } + } + + DoubleMatrix1D librarySize = MatrixStats.colSums( countMatrix ); + DoubleMatrix log2cpmMatrix = MatrixStats + .convertToLog2Cpm( properCountMatrix, librarySize ); + + ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix( ee, log2cpmQt, log2cpmMatrix ); + + // important: replaceData takes care of the platform switch if necessary; call first. It also deletes old QTs, so from here we have to remake them. + ee = this.replaceData( ee, targetArrayDesign, log2cpmEEMatrix ); + + } /** @@ -677,9 +669,9 @@ public ExpressionExperiment replaceData( ExpressionExperiment ee, ArrayDesign ta * RNA-seq * * @param ee experiment - * @param countEEMatrix count ee matrix - * @param readLength read length - * @param isPairedReads is paired reads + * @param countEEMatrix count ee matrix; used to compute library sizes + * @param readLength read length (optional) + * @param isPairedReads is paired reads (optional) */ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDoubleMatrix countEEMatrix, Integer readLength, Boolean isPairedReads ) { @@ -693,8 +685,14 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo } DataUpdaterImpl.log.info( ba + " total library size=" + librarySize ); - ba.setSequenceReadLength( readLength ); - ba.setSequencePairedReads( isPairedReads ); + if ( readLength != null ) { + ba.setSequenceReadLength( readLength ); + } + + if ( isPairedReads != null ) { + ba.setSequencePairedReads( isPairedReads ); + } + ba.setSequenceReadCount( librarySize ); bioAssayService.update( ba ); diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java index 86f9691e77..937323fc94 100644 --- a/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java +++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java @@ -146,7 +146,7 @@ public void testCountData() throws Exception { // the experiment has 8 samples but the data has 4 columns so allow missing samples // GSM718707 GSM718708 GSM718709 GSM718710 - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, null, 36, true, true ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, null, 36, true, true, false ); } // make sure to do a thawRawAndProcessed() to get the addCountData() updates diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java index e6f615e933..5507b9d6bc 100644 --- a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java +++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java @@ -33,7 +33,6 @@ import ubic.gemma.core.security.authorization.acl.AclTestUtils; import ubic.gemma.core.util.test.category.GeoTest; import ubic.gemma.core.util.test.category.SlowTest; -import ubic.gemma.model.common.auditAndSecurity.AuditAction; import ubic.gemma.model.common.quantitationtype.*; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.arrayDesign.TechnologyType; @@ -235,12 +234,12 @@ final public void testServiceCreateCountData() throws Exception { targetArrayDesign = arrayDesignService.thaw( targetArrayDesign ); try { - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false ); fail( "Should have gotten an exception" ); } catch ( IllegalArgumentException e ) { // Expected } - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true, false ); } ee = eeService.thaw( this.ee ); diff --git a/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java index bd183ba894..8ee5f15221 100644 --- a/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java +++ b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java @@ -241,7 +241,7 @@ public void testLoadRNASeqData() throws Exception { assertEquals( 199, targetArrayDesign.getCompositeSequences().size() ); // Main step. - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false ); ee = experimentService.loadOrFail( ee.getId() ); ee = experimentService.thaw( ee ); @@ -285,7 +285,7 @@ public void testLoadRNASeqData() throws Exception { assertFalse( dataVectorService.getProcessedDataVectors( ee2 ).isEmpty() ); // Call it again to test that we don't leak QTs - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false ); ee = experimentService.load( ee.getId() ); assertNotNull( ee ); ee = this.experimentService.thawLite( ee ); @@ -328,12 +328,12 @@ public void testLoadRNASeqDataWithMissingSamples() throws Exception { .getTestPersistentArrayDesign( probeNames, taxonService.findByCommonName( "human" ) ); targetArrayDesign = arrayDesignService.thaw( targetArrayDesign ); try { - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false ); fail( "Should have gotten an exception" ); } catch ( IllegalArgumentException e ) { // Expected } - dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true ); + dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true, false ); } /* From 36bf8df2a6db8c841f6e793c6e881c0d55a5958a Mon Sep 17 00:00:00 2001 From: Paul Pavlidis Date: Mon, 17 Jul 2023 11:13:17 -0700 Subject: [PATCH 2/8] add a check that the library size is not changed when adding counts to data that already has librarysizes --- .../ubic/gemma/core/loader/expression/DataUpdaterImpl.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java index 24c06e332d..ed105e8c38 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java @@ -693,6 +693,9 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo ba.setSequencePairedReads( isPairedReads ); } + if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize ) { + throw new IllegalStateException( "Read count was already set for " + ba + " and is not the same (existing=" + ba.getSequenceReadCount() + " vs updated=" + librarySize + ")" ); + } ba.setSequenceReadCount( librarySize ); bioAssayService.update( ba ); From 2dd9ae120d46596656202c59e66236e0677a3b9c Mon Sep 17 00:00:00 2001 From: Paul Pavlidis Date: Mon, 17 Jul 2023 11:18:02 -0700 Subject: [PATCH 3/8] add check for whether already stored library sizes can deviate from the new --- .../gemma/core/loader/expression/DataUpdaterImpl.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java index ed105e8c38..731b35713f 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java @@ -222,7 +222,7 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign } ExpressionDataDoubleMatrix countEEMatrix = new ExpressionDataDoubleMatrix( ee, countqt, properCountMatrix ); ee = this.addData( ee, targetArrayDesign, countEEMatrix ); - this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads ); + this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads, skipLog2cpm ); /* add rpkm/fpkm data if available */ @@ -672,9 +672,12 @@ public ExpressionExperiment replaceData( ExpressionExperiment ee, ArrayDesign ta * @param countEEMatrix count ee matrix; used to compute library sizes * @param readLength read length (optional) * @param isPairedReads is paired reads (optional) + * @param requireExistingLibrarySizesMatch whether to care if existing information about librarysizes can conflict with the one stored. + * This should be true when simply backfilling counts, as a check. + * It should be false if we are actually trying to replace existing counts. */ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDoubleMatrix countEEMatrix, - Integer readLength, Boolean isPairedReads ) { + Integer readLength, Boolean isPairedReads, boolean requireExistingLibrarySizesMatch ) { for ( BioAssay ba : ee.getBioAssays() ) { Double[] col = countEEMatrix.getColumn( ba ); int librarySize = ( int ) Math.floor( DescriptiveWithMissing.sum( new DoubleArrayList( ArrayUtils.toPrimitive( col ) ) ) ); @@ -693,7 +696,7 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo ba.setSequencePairedReads( isPairedReads ); } - if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize ) { + if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize && requireExistingLibrarySizesMatch ) { throw new IllegalStateException( "Read count was already set for " + ba + " and is not the same (existing=" + ba.getSequenceReadCount() + " vs updated=" + librarySize + ")" ); } ba.setSequenceReadCount( librarySize ); From d124d35de3d65324e2dda3a47e740f267752f0da Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Mon, 24 Jul 2023 15:56:59 -0700 Subject: [PATCH 4/8] Use null as default for missing pairedness information --- .../src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java index 8f8c3b2af1..55d2409bc7 100644 --- a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java @@ -52,7 +52,7 @@ public class RNASeqDataAddCli extends ExpressionExperimentManipulatingCLI { private static final String MULTIQC_METADATA_FILE_OPT = "multiqc"; private boolean allowMissingSamples = false; private String countFile = null; - private boolean isPairedReads = false; + private Boolean isPairedReads = null; private String platformName = null; private Integer readLength = null; private String rpkmFile = null; From ea1f70357f7335904712563a57584933dd9e02a2 Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Mon, 24 Jul 2023 15:55:35 -0700 Subject: [PATCH 5/8] Add tests for the -noLog2cpm switch --- .../gemma/core/apps/RNASeqDataAddCliTest.java | 112 ++++++++++++++++++ .../src/test/resources/GSE00001.counts.txt | 2 + .../src/test/resources/GSE00001.rpkm.txt | 2 + 3 files changed, 116 insertions(+) create mode 100644 gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java create mode 100644 gemma-cli/src/test/resources/GSE00001.counts.txt create mode 100644 gemma-cli/src/test/resources/GSE00001.rpkm.txt diff --git a/gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java b/gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java new file mode 100644 index 0000000000..da730c3140 --- /dev/null +++ b/gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java @@ -0,0 +1,112 @@ +package ubic.gemma.core.apps; + +import org.junit.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.security.test.context.support.WithMockUser; +import org.springframework.test.annotation.DirtiesContext; +import org.springframework.test.context.ContextConfiguration; +import ubic.gemma.core.analysis.service.ExpressionDataFileService; +import ubic.gemma.core.genome.gene.service.GeneService; +import ubic.gemma.core.loader.expression.DataUpdater; +import ubic.gemma.core.search.SearchService; +import ubic.gemma.core.util.AbstractCLI; +import ubic.gemma.core.util.test.BaseCliTest; +import ubic.gemma.model.expression.arrayDesign.ArrayDesign; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; +import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; +import ubic.gemma.persistence.service.genome.taxon.TaxonService; +import ubic.gemma.persistence.util.TestComponent; + +import static org.junit.Assert.assertEquals; +import static org.mockito.Mockito.*; + +@DirtiesContext(classMode = DirtiesContext.ClassMode.AFTER_EACH_TEST_METHOD) +@ContextConfiguration +public class RNASeqDataAddCliTest extends BaseCliTest { + + @TestComponent + @Configuration + static class RNASeqDataAddCliTestContextConfiguration extends BaseCliTestContextConfiguration { + + @Bean + public RNASeqDataAddCli rnaSeqDataAddCli() { + return new RNASeqDataAddCli(); + } + + @Bean + public DataUpdater dataUpdater() { + return mock( DataUpdater.class ); + } + + @Bean + public GeneService geneService() { + return mock( GeneService.class ); + } + + @Bean + public TaxonService taxonService() { + return mock( TaxonService.class ); + } + + @Bean + public SearchService searchService() { + return mock( SearchService.class ); + } + + @Bean + public ExpressionDataFileService expressionDataFileService() { + return mock( ExpressionDataFileService.class ); + } + + @Bean + public ArrayDesignService arrayDesignService() { + return mock( ArrayDesignService.class ); + } + } + + @Autowired + private RNASeqDataAddCli cli; + + @Autowired + private DataUpdater dataUpdater; + + @Autowired + private ExpressionExperimentService expressionExperimentService; + + @Autowired + private ArrayDesignService arrayDesignService; + + @Test + @WithMockUser + public void test() { + ExpressionExperiment ee = new ExpressionExperiment(); + ArrayDesign ad = new ArrayDesign(); + when( expressionExperimentService.findByShortName( "GSE00001" ) ).thenReturn( ee ); + when( arrayDesignService.findByShortName( "Generic_human_ncbiIds" ) ).thenReturn( ad ); + assertEquals( AbstractCLI.SUCCESS, cli.executeCommand( new String[] { + "-e", "GSE00001", "-a", "Generic_human_ncbiIds", + "-rpkm", getClass().getResource( "/GSE00001.rpkm.txt" ).getFile(), + "-count", getClass().getResource( "/GSE00001.counts.txt" ).getFile() } ) ); + verify( dataUpdater ).addCountData( same( ee ), same( ad ), isNotNull(), isNotNull(), isNull(), + isNull(), eq( false ), eq( false ) ); + } + + @Test + @WithMockUser + public void testSkipLog2cpm() { + ExpressionExperiment ee = new ExpressionExperiment(); + ArrayDesign ad = new ArrayDesign(); + when( expressionExperimentService.findByShortName( "GSE00001" ) ).thenReturn( ee ); + when( arrayDesignService.findByShortName( "Generic_human_ncbiIds" ) ).thenReturn( ad ); + assertEquals( AbstractCLI.SUCCESS, cli.executeCommand( new String[] { + "-e", "GSE00001", "-a", "Generic_human_ncbiIds", + "-rpkm", getClass().getResource( "/GSE00001.rpkm.txt" ).getFile(), + "-count", getClass().getResource( "/GSE00001.counts.txt" ).getFile(), + "-noLog2cpm" } ) ); + verify( dataUpdater ).addCountData( same( ee ), same( ad ), isNotNull(), isNotNull(), isNull(), + isNull(), eq( false ), eq( true ) ); + } +} \ No newline at end of file diff --git a/gemma-cli/src/test/resources/GSE00001.counts.txt b/gemma-cli/src/test/resources/GSE00001.counts.txt new file mode 100644 index 0000000000..dc8d95a79d --- /dev/null +++ b/gemma-cli/src/test/resources/GSE00001.counts.txt @@ -0,0 +1,2 @@ +GSM000001 +1 \ No newline at end of file diff --git a/gemma-cli/src/test/resources/GSE00001.rpkm.txt b/gemma-cli/src/test/resources/GSE00001.rpkm.txt new file mode 100644 index 0000000000..acde930d36 --- /dev/null +++ b/gemma-cli/src/test/resources/GSE00001.rpkm.txt @@ -0,0 +1,2 @@ +GSM000001 +0.5 \ No newline at end of file From a46815a942859f6bf2fd03966d89e6860f103c60 Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Mon, 24 Jul 2023 20:25:02 -0700 Subject: [PATCH 6/8] Update CLI docs and arg logic that omitting pairedness result in undefined pairedness --- .../java/ubic/gemma/core/apps/RNASeqDataAddCli.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java index 55d2409bc7..d87f22280d 100644 --- a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java @@ -75,7 +75,7 @@ protected void buildOptions( Options options ) { options.addOption( RNASeqDataAddCli.ALLOW_MISSING, "Set this if your data files don't have information for all samples." ); options.addOption( Option.builder( "a" ).longOpt( null ).desc( "Target platform (must already exist in the system)" ).argName( "platform short name" ).hasArg().build() ); - options.addOption( Option.builder( RNASeqDataAddCli.METADATAOPT ).longOpt( null ).desc( "Information on read length given as a string like '100:paired', '36 (assumed unpaired)', or '36:unpaired' " ).argName( "length" ).hasArg().build() ); + options.addOption( Option.builder( RNASeqDataAddCli.METADATAOPT ).longOpt( null ).desc( "Information on read length given as a string like '100:paired', '36' (no pairedness assumed), or '36:unpaired' " ).argName( "length" ).hasArg().build() ); options.addOption( "log2cpm", "Just compute log2cpm from the existing stored count data (backfill); batchmode OK, no other options needed" ); @@ -126,6 +126,8 @@ protected void processOptions( CommandLine commandLine ) { } else { throw new IllegalArgumentException( "Value must be either 'paired' or 'unpaired' or left blank" ); } + } else { + this.isPairedReads = null; } } @@ -142,10 +144,10 @@ protected void processOptions( CommandLine commandLine ) { this.platformName = commandLine.getOptionValue( "a" ); - if (commandLine.hasOption( "noLog2cpm")) { + if ( commandLine.hasOption( "noLog2cpm" ) ) { this.justbackfillCountsAndRPKM = true; - if (this.justbackfillLog2cpm) { - throw new IllegalArgumentException("Can't use both noLog2cpm and log2cpm options"); + if ( this.justbackfillLog2cpm ) { + throw new IllegalArgumentException( "Can't use both noLog2cpm and log2cpm options" ); } } From d08628a05eb11a02264394ab61e739b893d9fcc0 Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Mon, 24 Jul 2023 21:00:46 -0700 Subject: [PATCH 7/8] Extract interface from ExpressionExperimentPlatformSwitchService This is needed to write unit tests for DataUpdater. --- ...essionExperimentPlatformSwitchService.java | 649 +---------------- ...onExperimentPlatformSwitchServiceImpl.java | 652 ++++++++++++++++++ .../ArrayDesignMergeServiceImpl.java | 5 +- 3 files changed, 658 insertions(+), 648 deletions(-) create mode 100644 gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java index d892b21ad9..b54789d89c 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java @@ -1,655 +1,12 @@ -/* - * The Gemma project - * - * Copyright (c) 2007 University of British Columbia - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ package ubic.gemma.core.loader.expression; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - -import ubic.gemma.core.analysis.expression.AnalysisUtilService; -import ubic.gemma.core.analysis.preprocess.VectorMergingService; -import ubic.gemma.core.analysis.service.ExpressionExperimentVectorManipulatingService; -import ubic.gemma.model.common.quantitationtype.PrimitiveType; -import ubic.gemma.model.common.quantitationtype.QuantitationType; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; -import ubic.gemma.model.expression.arrayDesign.TechnologyType; -import ubic.gemma.model.expression.bioAssay.BioAssay; -import ubic.gemma.model.expression.bioAssayData.BioAssayDimension; -import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector; -import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector; -import ubic.gemma.model.expression.biomaterial.BioMaterial; -import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.expression.experiment.ExpressionExperiment; -import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet; import ubic.gemma.model.genome.biosequence.BioSequence; -import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService; -import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService; -import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService; -import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService; -import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; -import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSubSetService; - -/** - * Switch an expression experiment from one array design to another. This is valuable when the EE uses more than on AD, - * and a merged AD exists. The following steps are needed: - *
    - * - *
  • Delete old analyses of this experiment and processeddata vectors - *
  • For each array design, for each probe, identify the matching probe on the merged AD. Have to deal with situation - *
  • where more than one occurrence of each sequence is found. - *
  • all DEDVs must be switched to use the new AD's design elements - *
  • all bioassays must be switched to the new AD. - *
  • update the EE description - *
  • commit changes. - *
  • Computed processed data vectors - *
- * This also handles the case of multisamples-per-platform - NOT the case of one-sample-per-platform but multiple - * platforms; for that you have to run VectorMerging. For more nutty situations this will probably create a mess. - * - * @author pavlidis - * @see VectorMergingService - */ -@Component -public class ExpressionExperimentPlatformSwitchService extends ExpressionExperimentVectorManipulatingService { - - /** - * Used to identify design elements that have no sequence associated with them. - */ - public static final BioSequence NULL_BIOSEQUENCE; - private static final Log log = LogFactory.getLog( ExpressionExperimentPlatformSwitchService.class.getName() ); - - static { - NULL_BIOSEQUENCE = BioSequence.Factory.newInstance(); - ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE.setName( "______NULL______" ); - ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE.setId( -1L ); - } - - @Autowired - private ArrayDesignService arrayDesignService; - - @Autowired - private BioAssayDimensionService bioAssayDimensionService; - - @Autowired - private BioAssayService bioAssayService; - - @Autowired - private ExpressionExperimentService expressionExperimentService; - - @Autowired - private ExperimentPlatformSwitchHelperService helperService; - - @Autowired - private SampleCoexpressionAnalysisService sampleCoexpressionAnalysisService; - - @Autowired - private ExpressionExperimentSubSetService subsetService; - - @Autowired - private AnalysisUtilService analysisUtilService; - - /** - * If you know the array designs are already in a merged state, you should use switchExperimentToMergedPlatform - * - * @param ee ee - * @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be - * changed for them. - * @return the switched experiment - */ - public void switchExperimentToArrayDesign( ExpressionExperiment ee, ArrayDesign arrayDesign ) { - assert arrayDesign != null; - - // remove stuff that will be in the way. - processedExpressionDataVectorService.removeProcessedDataVectors( ee ); - sampleCoexpressionAnalysisService.removeForExperiment( ee ); - for ( ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets( ee ) ) { - subsetService.remove( subset ); - } - analysisUtilService.deleteOldAnalyses( ee ); - - // get relation between sequence and designelements. - Map> designElementMap = new HashMap<>(); - Collection elsWithNoSeq = new HashSet<>(); - this.populateCSeq( arrayDesign, designElementMap, elsWithNoSeq ); - - ee = expressionExperimentService.thaw( ee ); - - if ( elsWithNoSeq.size() > 0 ) { - ExpressionExperimentPlatformSwitchService.log - .info( elsWithNoSeq.size() + " elements on the new platform have no associated sequence." ); - designElementMap.put( ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq ); - } - - boolean multiPlatformPerSample = this.switchPlatform( ee, arrayDesign ); - /* - * For a multiplatform-per-sample case ("Case 2", platforms are disjoint): (note that some samples might just be - * on one platform...) - * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it). - * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors (switchDataForPlatform) - * 3. Delete the Bioassays that are using other BADs (cleanupUnused) - * - * For Case 1 (each sample run on one platform, all platforms are related/similar) Simply switch the vectors - * (switchDataForPlatform) - */ - - /* - * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that - * includes all BioMaterials. - */ - Collection unusedBADs = new HashSet<>(); - BioAssayDimension targetBioAssayDimension = null; - if ( multiPlatformPerSample ) { - targetBioAssayDimension = this.doMultiSample( ee, unusedBADs ); - } - if ( multiPlatformPerSample && targetBioAssayDimension == null ) { - throw new RuntimeException( "Data set cannot be switched to merged platform: no suitable bioassaydimension found" ); - } - - /* - * To account for cases where we have no data loaded yet. - */ - boolean hasData = expressionExperimentService.getQuantitationTypes( ee ).size() > 0 && ee.getRawExpressionDataVectors().size() > 0; - - Collection oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( ee ); - Map> usedDesignElements = new HashMap<>(); - int totalVectorsSwitched = 0; - for ( ArrayDesign oldAd : oldArrayDesigns ) { - totalVectorsSwitched += this.switchDataForPlatform( ee, arrayDesign, designElementMap, targetBioAssayDimension, usedDesignElements, - oldAd ); - } - - if ( totalVectorsSwitched == 0 && hasData ) { - throw new RuntimeException( "No vectors were switched" ); - } - - String descriptionUpdate = "[Switched to use " + arrayDesign.getShortName() + " by Gemma]"; - if ( !ee.getDescription().contains( descriptionUpdate ) ) { - ee.setDescription( ee.getDescription() + " " + descriptionUpdate ); - } - - helperService.persist( ee, arrayDesign ); - - log.info( "Finishing up" ); - if ( targetBioAssayDimension != null && !unusedBADs.isEmpty() ) { - this.cleanupUnused( unusedBADs, targetBioAssayDimension ); - } - - ee = expressionExperimentService.loadOrFail( ee.getId() ); // refresh after cleanup - ee = expressionExperimentService.thawLite( ee ); - - if ( hasData ) { - processedExpressionDataVectorService.createProcessedDataVectors( ee ); // this still fails sometimes? works fine if run later by cli - } - } - - /** - * Automatically identify an appropriate merged platform - * - * @param expExp ee - * @return ee - */ - public void switchExperimentToMergedPlatform( ExpressionExperiment expExp ) { - ArrayDesign arrayDesign = this.locateMergedDesign( expExp ); - if ( arrayDesign == null ) - throw new IllegalArgumentException( "Experiment has no merged design to switch to" ); - this.switchExperimentToArrayDesign( expExp, arrayDesign ); - } - - private boolean switchPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign ) { - boolean multiPlatformPerSample = false; - for ( BioAssay assay : ee.getBioAssays() ) { - - // Switch the assay to use the desired platform. However, if this is a second switch, don't lose the original value - if ( assay.getOriginalPlatform() == null ) { - assay.setOriginalPlatform( assay.getArrayDesignUsed() ); - } - assay.setArrayDesignUsed( arrayDesign ); - - /* - * Side effect: Detect cases of each-sample-run-on-multiple-platforms - we need to merge the bioassays, too. - * That means fiddling with the BioAssayDimensions (BADs). We do this check here to avoid unnecessarily - * inspecting BADs. - */ - if ( !multiPlatformPerSample && assay.getSampleUsed().getBioAssaysUsedIn().size() > 1 ) { - multiPlatformPerSample = true; - } - } - return multiPlatformPerSample; - } - - /** - * Remove bioassays that are no longer needed - * - */ - private void cleanupUnused( Collection unusedBADs, BioAssayDimension maxBAD ) { - ExpressionExperimentPlatformSwitchService.log.info( "Checking for unused BioAssays after merge" ); - - Collection removed = new HashSet<>(); - for ( BioAssayDimension bioAssayDimension : unusedBADs ) { - List bioAssays = bioAssayDimension.getBioAssays(); - for ( BioAssay ba : bioAssays ) { - if ( !maxBAD.getBioAssays().contains( ba ) && !removed.contains( ba ) ) { - try { - ExpressionExperimentPlatformSwitchService.log.info( "Deleting unused bioassay: " + ba ); - bioAssayService.remove( ba ); - removed.add( ba ); - } catch ( Exception e ) { - log.error( "Failed to delete: " + ba, e ); - } - } - } - try { - bioAssayDimensionService.remove( bioAssayDimension ); - log.info( "Removed unused bioAssayDimension ID=" + bioAssayDimension.getId() ); - } catch ( Exception e ) { - log.warn( "Failed to clean up unused bioassaydimension with ID=" + bioAssayDimension.getId() + ": " + e - .getMessage() ); - } - } - log.info( "Removed " + removed.size() + " usused bioassays" ); - } - - /** - * Find the bioassaydimension that covers all the biomaterials. - * - */ - private BioAssayDimension doMultiSample( ExpressionExperiment ee, Collection unusedBADs ) { - - int maxSize = 0; - BioAssayDimension maxBAD = null; - for ( BioAssay ba : ee.getBioAssays() ) { - Collection oldBioAssayDims = bioAssayService.findBioAssayDimensions( ba ); - for ( BioAssayDimension bioAssayDim : oldBioAssayDims ) { - unusedBADs.add( bioAssayDim ); - int size = bioAssayDim.getBioAssays().size(); - - if ( size > maxSize ) { - maxSize = size; - maxBAD = bioAssayDim; - } - } - } - - assert unusedBADs.size() > 1; // otherwise we shouldn't be here. - unusedBADs.remove( maxBAD ); - - /* - * Make sure all biomaterials in the study are included in the chosen bioassaydimension. If not, we'd have - * to make a new BAD. - */ - if ( maxBAD != null ) { - Collection bmsInmaxBAD = new HashSet<>(); - for ( BioAssay ba : maxBAD.getBioAssays() ) { - bmsInmaxBAD.add( ba.getSampleUsed() ); - } - - for ( BioAssay ba : ee.getBioAssays() ) { - if ( !bmsInmaxBAD.contains( ba.getSampleUsed() ) ) { - - ExpressionExperimentPlatformSwitchService.log - .debug( "This experiment looked like it had samples run on more than one platform, " - + "but it also has no BioAssayDimension that is eligible to accomodate all samples (Example: " - + ba.getSampleUsed() ); - maxBAD = null; - break; - } - } - - } - - if ( maxBAD == null ) { - log.info( "Creating new bioassaydimension to accomodate merged data as no existing one was suitable" ); - maxBAD = BioAssayDimension.Factory.newInstance( "For " + ee.getBioAssays().size() + - " bioMaterials", "Created to accomodate platform switch", new ArrayList<>( ee.getBioAssays() ) ); - maxBAD = bioAssayDimensionService.create( maxBAD ); - } - - - return maxBAD; - } - - /** - * Determine whether the two bioassaydimensions are the same, based on the samples used. Note it is inefficient to - * call this over and over but it's not a big deal so far. - * - * @param currentOrder current order - * @param desiredOrder desired order - * @return boolean - */ - private boolean equivalent( List currentOrder, List desiredOrder ) { - if ( currentOrder.size() != desiredOrder.size() ) { - return false; - } - - for ( int i = 0; i < currentOrder.size(); i++ ) { - if ( !currentOrder.get( i ).getSampleUsed().equals( desiredOrder.get( i ).getSampleUsed() ) ) { - return false; - } - } - - return true; - - } - - private ArrayDesign locateMergedDesign( ExpressionExperiment expExp ) { - // get the array designs for this EE - ArrayDesign arrayDesign = null; - Collection oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( expExp ); - - // find the AD they have been merged into, make sure it is exists and they are all merged into the same AD. - for ( ArrayDesign design : oldArrayDesigns ) { - ArrayDesign mergedInto = design.getMergedInto(); - // TODO: use thaw for collection of platforms - mergedInto = arrayDesignService.thaw( mergedInto ); - - if ( mergedInto == null ) { - throw new IllegalArgumentException( - design + " used by " + expExp + " is not merged into another design" ); - } - - // TODO: go up the merge tree to find the root. This is too slow. - // while ( mergedInto.getMergedInto() != null ) { - // mergedInto = arrayDesignService.thaw( mergedInto.getMergedInto() ); - // } - - if ( arrayDesign == null ) { - arrayDesign = mergedInto; - arrayDesign = arrayDesignService.thaw( arrayDesign ); - } - - if ( !mergedInto.equals( arrayDesign ) ) { - throw new IllegalArgumentException( - design + " used by " + expExp + " is not merged into " + arrayDesign ); - } - - } - return arrayDesign; - } - - /** - * Set up a map of sequences to elements for the platform we're switching to - * - */ - private void populateCSeq( ArrayDesign arrayDesign, - Map> designElementMap, - Collection elsWithNoSeq ) { - for ( CompositeSequence cs : arrayDesign.getCompositeSequences() ) { - BioSequence bs = cs.getBiologicalCharacteristic(); - if ( bs == null ) { - elsWithNoSeq.add( cs ); - } else { - if ( !designElementMap.containsKey( bs ) ) { - designElementMap.put( bs, new HashSet() ); - } - designElementMap.get( bs ).add( cs ); - } - } - } - - /** - * @param designElementMap Mapping of sequences to probes for the platform that is being switch from. This is - * used - * to identify new candidates. - * @param usedDesignElements probes from the new design that have already been assigned to probes from the old - * design. If things are done correctly (the old design was merged into the new) then - * there should be enough. - * Map is of the new design probe to the old design probe it was used for (this is - * debugging information) - * @param vector vector - * @param bad BioAssayDimension to use, if necessary. If this is null or already the one used, - * it's ignored. - * Otherwise the vector data will be rewritten to match it. - * @return true if a switch was made - * @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence - */ - private boolean processVector( Map> designElementMap, - Map> usedDesignElements, RawExpressionDataVector vector, - BioAssayDimension bad ) { - - CompositeSequence oldDe = vector.getDesignElement(); - - Collection newElCandidates; - BioSequence seq = oldDe.getBiologicalCharacteristic(); - if ( seq == null ) { - newElCandidates = designElementMap.get( ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE ); - } else { - newElCandidates = designElementMap.get( seq ); - } - - if ( newElCandidates == null || newElCandidates.isEmpty() ) { - return false; - // throw new IllegalStateException( - // "There are no candidates probes for sequence: " + seq + "('null' should be okay)" ); - } - - for ( CompositeSequence newEl : newElCandidates ) { - if ( !usedDesignElements.containsKey( newEl ) ) { - - vector.setDesignElement( newEl ); - usedDesignElements.put( newEl, new HashSet() ); - usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() ); - break; - } - - if ( !usedDesignElements.get( newEl ).contains( vector.getBioAssayDimension() ) ) { - /* - * Then it's okay to use it. - */ - vector.setDesignElement( newEl ); - usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() ); - break; - } - } - - if ( bad != null && !vector.getBioAssayDimension().equals( bad ) ) { - /* - * 1. Check if they are already the same; then just switch it to the desired BAD - * 2. If not, then the vector data has to be rewritten. - */ - this.vectorReWrite( vector, bad ); - - } - return true; - } - - /** - * Switch the vectors from one platform to the other, using the bioassaydimension passed if non-null - * - * @param targetBioAssayDimension - if null we keep the current one; otherwise we rewrite data datavectors to use - * the new one. - * @return how many vectors were switched - */ - private int switchDataForPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign, - Map> designElementMap, BioAssayDimension targetBioAssayDimension, - Map> usedDesignElements, ArrayDesign oldAd ) { - if ( oldAd.equals( arrayDesign ) ) - return 0; - - oldAd = arrayDesignService.thaw( oldAd ); - - if ( oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals( TechnologyType.SEQUENCING ) ) { - /* - * Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload. - */ - throw new IllegalStateException( oldAd + " has no elements" ); - } - - int totalSwitched = 0; - Collection qts = expressionExperimentService.getQuantitationTypes( ee, oldAd ); - ExpressionExperimentPlatformSwitchService.log - .info( "Processing " + qts.size() + " quantitation types for vectors on " + oldAd ); - for ( QuantitationType type : qts ) { - - // use each design element only once per quantitation type + bioassaydimension per array design - usedDesignElements.clear(); - - // assumption: we have no processed data. They should have been deleted at this point. - Collection vecsForQt = new HashSet<>(); - - // - - int count = 0; - for ( RawExpressionDataVector vector : ee.getRawExpressionDataVectors() ) { - - if ( !vector.getQuantitationType().equals( type ) ) { - continue; - } - - CompositeSequence oldDe = vector.getDesignElement(); - - if ( !oldDe.getArrayDesign().equals( oldAd ) ) { - continue; - } - vecsForQt.add( vector ); - - } - - if ( vecsForQt.isEmpty() ) { - /* - * This can happen when the quantitation types vary for the array designs. - */ - log.info( "No vectors for " + type + " on " + oldAd + " (Might be okay; not all QTs might be on all platforms)" ); - continue; - } - - log.info( "Switching " + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName() - + " to " + arrayDesign.getShortName() - + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) ); - - vecsForQt = rawExpressionDataVectorService.thaw( vecsForQt ); - - int numwarns = 0; - int maxwarns = 30; - for ( RawExpressionDataVector vector : vecsForQt ) { - if ( this.processVector( designElementMap, usedDesignElements, vector, targetBioAssayDimension ) ) { - count++; - } else { - if ( numwarns++ < maxwarns ) { - log.warn( "No matching element found on target platform for " + vector.getDesignElement() ); - } - if ( numwarns == maxwarns ) { - log.warn( "[Further no-match warnings suppressed]" ); - } - } - } - - if ( count != vecsForQt.size() ) { - throw new IllegalStateException( - "Found matches for only " + count + "/" + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName() - + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) ); - } - - // sanity check. this is all fine. - // for ( RawExpressionDataVector v : vecsForQt ) { - // if ( v.getQuantitationType().equals( type ) - // && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) ) { - // throw new IllegalStateException( "A raw vector for QT =" + v.getQuantitationType() - // + " was not correctly switched to the target platform " + arrayDesign + ", it was on " - // + v.getDesignElement().getArrayDesign() + " while switching from " + oldAd ); - // } - // } - - //this check shouldn't be necessary - for ( RawExpressionDataVector v : ee.getRawExpressionDataVectors() ) { - if ( v.getQuantitationType().equals( type ) - && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) && v.getDesignElement().getArrayDesign().equals( oldAd ) ) { - throw new IllegalStateException( "A raw vector " + v + "for QT =" + v.getQuantitationType() - + " was not correctly switched to the target platform " + arrayDesign + ", it was still on " - + v.getDesignElement().getArrayDesign() ); - } - } - - totalSwitched += count; - - } - return totalSwitched; - } - - /** - * Rearrange/expand a vector as necessary to use the given BioAssayDimension. Only used for multiplatform case of - * samples run on multiple platforms. - * - * @param vector vector - * @param bad to be used as the replacement. - */ - private void vectorReWrite( DesignElementDataVector vector, BioAssayDimension bad ) { - List desiredOrder = bad.getBioAssays(); - List currentOrder = vector.getBioAssayDimension().getBioAssays(); - if ( this.equivalent( currentOrder, desiredOrder ) ) { - // Easy, we can just switch it. - vector.setBioAssayDimension( bad ); - return; - } - - /* - * We remake the data vector following the new ordering. - */ - PrimitiveType representation = vector.getQuantitationType().getRepresentation(); - Object missingVal; - if ( representation.equals( PrimitiveType.DOUBLE ) ) { - missingVal = Double.NaN; - } else if ( representation.equals( PrimitiveType.STRING ) ) { - missingVal = ""; - } else if ( representation.equals( PrimitiveType.INT ) ) { - missingVal = 0; - } else if ( representation.equals( PrimitiveType.BOOLEAN ) ) { - missingVal = false; - } else { - throw new UnsupportedOperationException( - "Missing values in data vectors of type " + representation + " not supported (when processing " - + vector ); - } - - List oldData = new ArrayList<>(); - super.convertFromBytes( oldData, vector.getQuantitationType().getRepresentation(), vector ); - - /* - * Now data has the old data, so we need to rearrange it to match, inserting missings as necessary. - */ - Map bm2loc = new HashMap<>(); - int i = 0; - List newData = new ArrayList<>(); - // initialize - for ( BioAssay ba : desiredOrder ) { - bm2loc.put( ba.getSampleUsed(), i++ ); - newData.add( missingVal ); - } - // Put data into new locations - int j = 0; - for ( BioAssay ba : currentOrder ) { - Integer loc = bm2loc.get( ba.getSampleUsed() ); - assert loc != null; - newData.set( loc, oldData.get( j++ ) ); - } +public interface ExpressionExperimentPlatformSwitchService { - byte[] newDataAr = converter.toBytes( newData.toArray() ); - vector.setData( newDataAr ); - vector.setBioAssayDimension( bad ); + void switchExperimentToArrayDesign( ExpressionExperiment ee, ArrayDesign arrayDesign ); - } + void switchExperimentToMergedPlatform( ExpressionExperiment expExp ); } diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java new file mode 100644 index 0000000000..b8ace91736 --- /dev/null +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java @@ -0,0 +1,652 @@ +/* + * The Gemma project + * + * Copyright (c) 2007 University of British Columbia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package ubic.gemma.core.loader.expression; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import ubic.gemma.core.analysis.expression.AnalysisUtilService; +import ubic.gemma.core.analysis.preprocess.VectorMergingService; +import ubic.gemma.core.analysis.service.ExpressionExperimentVectorManipulatingService; +import ubic.gemma.model.common.quantitationtype.PrimitiveType; +import ubic.gemma.model.common.quantitationtype.QuantitationType; +import ubic.gemma.model.expression.arrayDesign.ArrayDesign; +import ubic.gemma.model.expression.arrayDesign.TechnologyType; +import ubic.gemma.model.expression.bioAssay.BioAssay; +import ubic.gemma.model.expression.bioAssayData.BioAssayDimension; +import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector; +import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector; +import ubic.gemma.model.expression.biomaterial.BioMaterial; +import ubic.gemma.model.expression.designElement.CompositeSequence; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; +import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet; +import ubic.gemma.model.genome.biosequence.BioSequence; +import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService; +import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService; +import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService; +import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSubSetService; + +import java.util.*; + +/** + * Switch an expression experiment from one array design to another. This is valuable when the EE uses more than on AD, + * and a merged AD exists. The following steps are needed: + *
    + * + *
  • Delete old analyses of this experiment and processeddata vectors + *
  • For each array design, for each probe, identify the matching probe on the merged AD. Have to deal with situation + *
  • where more than one occurrence of each sequence is found. + *
  • all DEDVs must be switched to use the new AD's design elements + *
  • all bioassays must be switched to the new AD. + *
  • update the EE description + *
  • commit changes. + *
  • Computed processed data vectors + *
+ * This also handles the case of multisamples-per-platform - NOT the case of one-sample-per-platform but multiple + * platforms; for that you have to run VectorMerging. For more nutty situations this will probably create a mess. + * + * @author pavlidis + * @see VectorMergingService + */ +@Component +public class ExpressionExperimentPlatformSwitchServiceImpl extends ExpressionExperimentVectorManipulatingService implements ExpressionExperimentPlatformSwitchService { + + private static final Log log = LogFactory.getLog( ExpressionExperimentPlatformSwitchServiceImpl.class.getName() ); + + /** + * Used to identify design elements that have no sequence associated with them. + */ + public static BioSequence NULL_BIOSEQUENCE; + + static { + NULL_BIOSEQUENCE = BioSequence.Factory.newInstance(); + ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE.setName( "______NULL______" ); + ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE.setId( -1L ); + } + + @Autowired + private ArrayDesignService arrayDesignService; + + @Autowired + private BioAssayDimensionService bioAssayDimensionService; + + @Autowired + private BioAssayService bioAssayService; + + @Autowired + private ExpressionExperimentService expressionExperimentService; + + @Autowired + private ExperimentPlatformSwitchHelperService helperService; + + @Autowired + private SampleCoexpressionAnalysisService sampleCoexpressionAnalysisService; + + @Autowired + private ExpressionExperimentSubSetService subsetService; + + @Autowired + private AnalysisUtilService analysisUtilService; + + /** + * If you know the array designs are already in a merged state, you should use switchExperimentToMergedPlatform + * + * @param ee ee + * @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be + * changed for them. + * @return the switched experiment + */ + @Override + public void switchExperimentToArrayDesign( ExpressionExperiment ee, ArrayDesign arrayDesign ) { + assert arrayDesign != null; + + // remove stuff that will be in the way. + processedExpressionDataVectorService.removeProcessedDataVectors( ee ); + sampleCoexpressionAnalysisService.removeForExperiment( ee ); + for ( ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets( ee ) ) { + subsetService.remove( subset ); + } + analysisUtilService.deleteOldAnalyses( ee ); + + // get relation between sequence and designelements. + Map> designElementMap = new HashMap<>(); + Collection elsWithNoSeq = new HashSet<>(); + this.populateCSeq( arrayDesign, designElementMap, elsWithNoSeq ); + + ee = expressionExperimentService.thaw( ee ); + + if ( elsWithNoSeq.size() > 0 ) { + ExpressionExperimentPlatformSwitchServiceImpl.log + .info( elsWithNoSeq.size() + " elements on the new platform have no associated sequence." ); + designElementMap.put( ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE, elsWithNoSeq ); + } + + boolean multiPlatformPerSample = this.switchPlatform( ee, arrayDesign ); + /* + * For a multiplatform-per-sample case ("Case 2", platforms are disjoint): (note that some samples might just be + * on one platform...) + * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it). + * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors (switchDataForPlatform) + * 3. Delete the Bioassays that are using other BADs (cleanupUnused) + * + * For Case 1 (each sample run on one platform, all platforms are related/similar) Simply switch the vectors + * (switchDataForPlatform) + */ + + /* + * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that + * includes all BioMaterials. + */ + Collection unusedBADs = new HashSet<>(); + BioAssayDimension targetBioAssayDimension = null; + if ( multiPlatformPerSample ) { + targetBioAssayDimension = this.doMultiSample( ee, unusedBADs ); + } + if ( multiPlatformPerSample && targetBioAssayDimension == null ) { + throw new RuntimeException( "Data set cannot be switched to merged platform: no suitable bioassaydimension found" ); + } + + /* + * To account for cases where we have no data loaded yet. + */ + boolean hasData = expressionExperimentService.getQuantitationTypes( ee ).size() > 0 && ee.getRawExpressionDataVectors().size() > 0; + + Collection oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( ee ); + Map> usedDesignElements = new HashMap<>(); + int totalVectorsSwitched = 0; + for ( ArrayDesign oldAd : oldArrayDesigns ) { + totalVectorsSwitched += this.switchDataForPlatform( ee, arrayDesign, designElementMap, targetBioAssayDimension, usedDesignElements, + oldAd ); + } + + if ( totalVectorsSwitched == 0 && hasData ) { + throw new RuntimeException( "No vectors were switched" ); + } + + String descriptionUpdate = "[Switched to use " + arrayDesign.getShortName() + " by Gemma]"; + if ( !ee.getDescription().contains( descriptionUpdate ) ) { + ee.setDescription( ee.getDescription() + " " + descriptionUpdate ); + } + + helperService.persist( ee, arrayDesign ); + + log.info( "Finishing up" ); + if ( targetBioAssayDimension != null && !unusedBADs.isEmpty() ) { + this.cleanupUnused( unusedBADs, targetBioAssayDimension ); + } + + ee = expressionExperimentService.loadOrFail( ee.getId() ); // refresh after cleanup + ee = expressionExperimentService.thawLite( ee ); + + if ( hasData ) { + processedExpressionDataVectorService.createProcessedDataVectors( ee ); // this still fails sometimes? works fine if run later by cli + } + } + + /** + * Automatically identify an appropriate merged platform + * + * @param expExp ee + * @return ee + */ + @Override + public void switchExperimentToMergedPlatform( ExpressionExperiment expExp ) { + ArrayDesign arrayDesign = this.locateMergedDesign( expExp ); + if ( arrayDesign == null ) + throw new IllegalArgumentException( "Experiment has no merged design to switch to" ); + this.switchExperimentToArrayDesign( expExp, arrayDesign ); + } + + private boolean switchPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign ) { + boolean multiPlatformPerSample = false; + for ( BioAssay assay : ee.getBioAssays() ) { + + // Switch the assay to use the desired platform. However, if this is a second switch, don't lose the original value + if ( assay.getOriginalPlatform() == null ) { + assay.setOriginalPlatform( assay.getArrayDesignUsed() ); + } + assay.setArrayDesignUsed( arrayDesign ); + + /* + * Side effect: Detect cases of each-sample-run-on-multiple-platforms - we need to merge the bioassays, too. + * That means fiddling with the BioAssayDimensions (BADs). We do this check here to avoid unnecessarily + * inspecting BADs. + */ + if ( !multiPlatformPerSample && assay.getSampleUsed().getBioAssaysUsedIn().size() > 1 ) { + multiPlatformPerSample = true; + } + } + return multiPlatformPerSample; + } + + /** + * Remove bioassays that are no longer needed + * + */ + private void cleanupUnused( Collection unusedBADs, BioAssayDimension maxBAD ) { + ExpressionExperimentPlatformSwitchServiceImpl.log.info( "Checking for unused BioAssays after merge" ); + + Collection removed = new HashSet<>(); + for ( BioAssayDimension bioAssayDimension : unusedBADs ) { + List bioAssays = bioAssayDimension.getBioAssays(); + for ( BioAssay ba : bioAssays ) { + if ( !maxBAD.getBioAssays().contains( ba ) && !removed.contains( ba ) ) { + try { + ExpressionExperimentPlatformSwitchServiceImpl.log.info( "Deleting unused bioassay: " + ba ); + bioAssayService.remove( ba ); + removed.add( ba ); + } catch ( Exception e ) { + log.error( "Failed to delete: " + ba, e ); + } + } + } + try { + bioAssayDimensionService.remove( bioAssayDimension ); + log.info( "Removed unused bioAssayDimension ID=" + bioAssayDimension.getId() ); + } catch ( Exception e ) { + log.warn( "Failed to clean up unused bioassaydimension with ID=" + bioAssayDimension.getId() + ": " + e + .getMessage() ); + } + } + log.info( "Removed " + removed.size() + " usused bioassays" ); + } + + /** + * Find the bioassaydimension that covers all the biomaterials. + * + */ + private BioAssayDimension doMultiSample( ExpressionExperiment ee, Collection unusedBADs ) { + + int maxSize = 0; + BioAssayDimension maxBAD = null; + for ( BioAssay ba : ee.getBioAssays() ) { + Collection oldBioAssayDims = bioAssayService.findBioAssayDimensions( ba ); + for ( BioAssayDimension bioAssayDim : oldBioAssayDims ) { + unusedBADs.add( bioAssayDim ); + int size = bioAssayDim.getBioAssays().size(); + + if ( size > maxSize ) { + maxSize = size; + maxBAD = bioAssayDim; + } + } + } + + assert unusedBADs.size() > 1; // otherwise we shouldn't be here. + unusedBADs.remove( maxBAD ); + + /* + * Make sure all biomaterials in the study are included in the chosen bioassaydimension. If not, we'd have + * to make a new BAD. + */ + if ( maxBAD != null ) { + Collection bmsInmaxBAD = new HashSet<>(); + for ( BioAssay ba : maxBAD.getBioAssays() ) { + bmsInmaxBAD.add( ba.getSampleUsed() ); + } + + for ( BioAssay ba : ee.getBioAssays() ) { + if ( !bmsInmaxBAD.contains( ba.getSampleUsed() ) ) { + + ExpressionExperimentPlatformSwitchServiceImpl.log + .debug( "This experiment looked like it had samples run on more than one platform, " + + "but it also has no BioAssayDimension that is eligible to accomodate all samples (Example: " + + ba.getSampleUsed() ); + maxBAD = null; + break; + } + } + + } + + if ( maxBAD == null ) { + log.info( "Creating new bioassaydimension to accomodate merged data as no existing one was suitable" ); + maxBAD = BioAssayDimension.Factory.newInstance( "For " + ee.getBioAssays().size() + + " bioMaterials", "Created to accomodate platform switch", new ArrayList<>( ee.getBioAssays() ) ); + maxBAD = bioAssayDimensionService.create( maxBAD ); + } + + + return maxBAD; + } + + /** + * Determine whether the two bioassaydimensions are the same, based on the samples used. Note it is inefficient to + * call this over and over but it's not a big deal so far. + * + * @param currentOrder current order + * @param desiredOrder desired order + * @return boolean + */ + private boolean equivalent( List currentOrder, List desiredOrder ) { + if ( currentOrder.size() != desiredOrder.size() ) { + return false; + } + + for ( int i = 0; i < currentOrder.size(); i++ ) { + if ( !currentOrder.get( i ).getSampleUsed().equals( desiredOrder.get( i ).getSampleUsed() ) ) { + return false; + } + } + + return true; + + } + + private ArrayDesign locateMergedDesign( ExpressionExperiment expExp ) { + // get the array designs for this EE + ArrayDesign arrayDesign = null; + Collection oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( expExp ); + + // find the AD they have been merged into, make sure it is exists and they are all merged into the same AD. + for ( ArrayDesign design : oldArrayDesigns ) { + ArrayDesign mergedInto = design.getMergedInto(); + // TODO: use thaw for collection of platforms + mergedInto = arrayDesignService.thaw( mergedInto ); + + if ( mergedInto == null ) { + throw new IllegalArgumentException( + design + " used by " + expExp + " is not merged into another design" ); + } + + // TODO: go up the merge tree to find the root. This is too slow. + // while ( mergedInto.getMergedInto() != null ) { + // mergedInto = arrayDesignService.thaw( mergedInto.getMergedInto() ); + // } + + if ( arrayDesign == null ) { + arrayDesign = mergedInto; + arrayDesign = arrayDesignService.thaw( arrayDesign ); + } + + if ( !mergedInto.equals( arrayDesign ) ) { + throw new IllegalArgumentException( + design + " used by " + expExp + " is not merged into " + arrayDesign ); + } + + } + return arrayDesign; + } + + /** + * Set up a map of sequences to elements for the platform we're switching to + * + */ + private void populateCSeq( ArrayDesign arrayDesign, + Map> designElementMap, + Collection elsWithNoSeq ) { + for ( CompositeSequence cs : arrayDesign.getCompositeSequences() ) { + BioSequence bs = cs.getBiologicalCharacteristic(); + if ( bs == null ) { + elsWithNoSeq.add( cs ); + } else { + if ( !designElementMap.containsKey( bs ) ) { + designElementMap.put( bs, new HashSet() ); + } + designElementMap.get( bs ).add( cs ); + } + } + } + + /** + * @param designElementMap Mapping of sequences to probes for the platform that is being switch from. This is + * used + * to identify new candidates. + * @param usedDesignElements probes from the new design that have already been assigned to probes from the old + * design. If things are done correctly (the old design was merged into the new) then + * there should be enough. + * Map is of the new design probe to the old design probe it was used for (this is + * debugging information) + * @param vector vector + * @param bad BioAssayDimension to use, if necessary. If this is null or already the one used, + * it's ignored. + * Otherwise the vector data will be rewritten to match it. + * @return true if a switch was made + * @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence + */ + private boolean processVector( Map> designElementMap, + Map> usedDesignElements, RawExpressionDataVector vector, + BioAssayDimension bad ) { + + CompositeSequence oldDe = vector.getDesignElement(); + + Collection newElCandidates; + BioSequence seq = oldDe.getBiologicalCharacteristic(); + if ( seq == null ) { + newElCandidates = designElementMap.get( ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE ); + } else { + newElCandidates = designElementMap.get( seq ); + } + + if ( newElCandidates == null || newElCandidates.isEmpty() ) { + return false; + // throw new IllegalStateException( + // "There are no candidates probes for sequence: " + seq + "('null' should be okay)" ); + } + + for ( CompositeSequence newEl : newElCandidates ) { + if ( !usedDesignElements.containsKey( newEl ) ) { + + vector.setDesignElement( newEl ); + usedDesignElements.put( newEl, new HashSet() ); + usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() ); + break; + } + + if ( !usedDesignElements.get( newEl ).contains( vector.getBioAssayDimension() ) ) { + /* + * Then it's okay to use it. + */ + vector.setDesignElement( newEl ); + usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() ); + break; + } + } + + if ( bad != null && !vector.getBioAssayDimension().equals( bad ) ) { + /* + * 1. Check if they are already the same; then just switch it to the desired BAD + * 2. If not, then the vector data has to be rewritten. + */ + this.vectorReWrite( vector, bad ); + + } + return true; + } + + /** + * Switch the vectors from one platform to the other, using the bioassaydimension passed if non-null + * + * @param targetBioAssayDimension - if null we keep the current one; otherwise we rewrite data datavectors to use + * the new one. + * @return how many vectors were switched + */ + private int switchDataForPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign, + Map> designElementMap, BioAssayDimension targetBioAssayDimension, + Map> usedDesignElements, ArrayDesign oldAd ) { + if ( oldAd.equals( arrayDesign ) ) + return 0; + + oldAd = arrayDesignService.thaw( oldAd ); + + if ( oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals( TechnologyType.SEQUENCING ) ) { + /* + * Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload. + */ + throw new IllegalStateException( oldAd + " has no elements" ); + } + + int totalSwitched = 0; + Collection qts = expressionExperimentService.getQuantitationTypes( ee, oldAd ); + ExpressionExperimentPlatformSwitchServiceImpl.log + .info( "Processing " + qts.size() + " quantitation types for vectors on " + oldAd ); + for ( QuantitationType type : qts ) { + + // use each design element only once per quantitation type + bioassaydimension per array design + usedDesignElements.clear(); + + // assumption: we have no processed data. They should have been deleted at this point. + Collection vecsForQt = new HashSet<>(); + + // + + int count = 0; + for ( RawExpressionDataVector vector : ee.getRawExpressionDataVectors() ) { + + if ( !vector.getQuantitationType().equals( type ) ) { + continue; + } + + CompositeSequence oldDe = vector.getDesignElement(); + + if ( !oldDe.getArrayDesign().equals( oldAd ) ) { + continue; + } + vecsForQt.add( vector ); + + } + + if ( vecsForQt.isEmpty() ) { + /* + * This can happen when the quantitation types vary for the array designs. + */ + log.info( "No vectors for " + type + " on " + oldAd + " (Might be okay; not all QTs might be on all platforms)" ); + continue; + } + + log.info( "Switching " + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName() + + " to " + arrayDesign.getShortName() + + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) ); + + vecsForQt = rawExpressionDataVectorService.thaw( vecsForQt ); + + int numwarns = 0; + int maxwarns = 30; + for ( RawExpressionDataVector vector : vecsForQt ) { + if ( this.processVector( designElementMap, usedDesignElements, vector, targetBioAssayDimension ) ) { + count++; + } else { + if ( numwarns++ < maxwarns ) { + log.warn( "No matching element found on target platform for " + vector.getDesignElement() ); + } + if ( numwarns == maxwarns ) { + log.warn( "[Further no-match warnings suppressed]" ); + } + } + } + + if ( count != vecsForQt.size() ) { + throw new IllegalStateException( + "Found matches for only " + count + "/" + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName() + + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) ); + } + + // sanity check. this is all fine. + // for ( RawExpressionDataVector v : vecsForQt ) { + // if ( v.getQuantitationType().equals( type ) + // && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) ) { + // throw new IllegalStateException( "A raw vector for QT =" + v.getQuantitationType() + // + " was not correctly switched to the target platform " + arrayDesign + ", it was on " + // + v.getDesignElement().getArrayDesign() + " while switching from " + oldAd ); + // } + // } + + //this check shouldn't be necessary + for ( RawExpressionDataVector v : ee.getRawExpressionDataVectors() ) { + if ( v.getQuantitationType().equals( type ) + && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) && v.getDesignElement().getArrayDesign().equals( oldAd ) ) { + throw new IllegalStateException( "A raw vector " + v + "for QT =" + v.getQuantitationType() + + " was not correctly switched to the target platform " + arrayDesign + ", it was still on " + + v.getDesignElement().getArrayDesign() ); + } + } + + totalSwitched += count; + + } + return totalSwitched; + } + + /** + * Rearrange/expand a vector as necessary to use the given BioAssayDimension. Only used for multiplatform case of + * samples run on multiple platforms. + * + * @param vector vector + * @param bad to be used as the replacement. + */ + private void vectorReWrite( DesignElementDataVector vector, BioAssayDimension bad ) { + List desiredOrder = bad.getBioAssays(); + List currentOrder = vector.getBioAssayDimension().getBioAssays(); + if ( this.equivalent( currentOrder, desiredOrder ) ) { + // Easy, we can just switch it. + vector.setBioAssayDimension( bad ); + return; + } + + /* + * We remake the data vector following the new ordering. + */ + PrimitiveType representation = vector.getQuantitationType().getRepresentation(); + Object missingVal; + if ( representation.equals( PrimitiveType.DOUBLE ) ) { + missingVal = Double.NaN; + } else if ( representation.equals( PrimitiveType.STRING ) ) { + missingVal = ""; + } else if ( representation.equals( PrimitiveType.INT ) ) { + missingVal = 0; + } else if ( representation.equals( PrimitiveType.BOOLEAN ) ) { + missingVal = false; + } else { + throw new UnsupportedOperationException( + "Missing values in data vectors of type " + representation + " not supported (when processing " + + vector ); + } + + List oldData = new ArrayList<>(); + super.convertFromBytes( oldData, vector.getQuantitationType().getRepresentation(), vector ); + + /* + * Now data has the old data, so we need to rearrange it to match, inserting missings as necessary. + */ + Map bm2loc = new HashMap<>(); + int i = 0; + List newData = new ArrayList<>(); + // initialize + for ( BioAssay ba : desiredOrder ) { + bm2loc.put( ba.getSampleUsed(), i++ ); + newData.add( missingVal ); + } + + // Put data into new locations + int j = 0; + for ( BioAssay ba : currentOrder ) { + Integer loc = bm2loc.get( ba.getSampleUsed() ); + assert loc != null; + newData.set( loc, oldData.get( j++ ) ); + } + + byte[] newDataAr = converter.toBytes( newData.toArray() ); + vector.setData( newDataAr ); + vector.setBioAssayDimension( bad ); + + } +} diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java index d8cefd2d54..34f04f7555 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java @@ -25,6 +25,7 @@ import org.springframework.stereotype.Component; import ubic.gemma.core.analysis.report.ArrayDesignReportService; import ubic.gemma.core.loader.expression.ExpressionExperimentPlatformSwitchService; +import ubic.gemma.core.loader.expression.ExpressionExperimentPlatformSwitchServiceImpl; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.genome.biosequence.BioSequence; @@ -210,7 +211,7 @@ private ArrayDesign makeBioSeqMap( Map makeNewProbes( ArrayDesign arrayDesign, CompositeSequence newCs = CompositeSequence.Factory.newInstance(); - if ( !bs.equals( ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE ) ) { + if ( !bs.equals( ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE ) ) { newCs.setBiologicalCharacteristic( bs ); } From 8c6a5283d8e2864c359ae5f4dedc275510627452 Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Mon, 24 Jul 2023 21:27:21 -0700 Subject: [PATCH 8/8] Add a unit test for skipping log2cpm and post-processing in DataUpdater --- .../loader/expression/DataUpdater2Test.java | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java diff --git a/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java new file mode 100644 index 0000000000..14817733b7 --- /dev/null +++ b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java @@ -0,0 +1,173 @@ +package ubic.gemma.core.loader.expression; + +import org.junit.Before; +import org.junit.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.AbstractJUnit4SpringContextTests; +import ubic.basecode.dataStructure.matrix.DoubleMatrix; +import ubic.basecode.io.reader.DoubleMatrixReader; +import ubic.gemma.core.analysis.preprocess.PreprocessorService; +import ubic.gemma.core.analysis.preprocess.VectorMergingService; +import ubic.gemma.core.loader.expression.geo.service.GeoService; +import ubic.gemma.model.expression.arrayDesign.ArrayDesign; +import ubic.gemma.model.expression.bioAssay.BioAssay; +import ubic.gemma.model.expression.biomaterial.BioMaterial; +import ubic.gemma.model.expression.designElement.CompositeSequence; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; +import ubic.gemma.persistence.service.analysis.expression.pca.PrincipalComponentAnalysisService; +import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService; +import ubic.gemma.persistence.service.common.auditAndSecurity.AuditTrailService; +import ubic.gemma.persistence.service.common.quantitationtype.QuantitationTypeService; +import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService; +import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService; +import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService; +import ubic.gemma.persistence.service.expression.bioAssayData.RawAndProcessedExpressionDataVectorService; +import ubic.gemma.persistence.service.expression.bioAssayData.RawExpressionDataVectorService; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; +import ubic.gemma.persistence.util.TestComponent; + +import java.io.IOException; +import java.util.Collections; + +import static org.mockito.Mockito.*; + +@ContextConfiguration +public class DataUpdater2Test extends AbstractJUnit4SpringContextTests { + + @Configuration + @TestComponent + static class DataUpdater2TestContextConfiguration { + + @Bean + public DataUpdater dataUpdater() { + return new DataUpdaterImpl(); + } + + @Bean + public ArrayDesignService arrayDesignService() { + return mock( ArrayDesignService.class ); + } + + @Bean + public AuditTrailService auditTrailService() { + return mock( AuditTrailService.class ); + } + + @Bean + public BioAssayDimensionService bioAssayDimensionService() { + return mock( BioAssayDimensionService.class ); + } + + @Bean + public BioAssayService bioAssayService() { + return mock( BioAssayService.class ); + } + + @Bean + public ExpressionExperimentPlatformSwitchService experimentPlatformSwitchService() { + return mock( ExpressionExperimentPlatformSwitchService.class ); + } + + @Bean + public ExpressionExperimentService experimentService() { + return mock( ExpressionExperimentService.class ); + } + + @Bean + public GeoService geoService() { + return mock( GeoService.class ); + } + + @Bean + public PrincipalComponentAnalysisService pcaService() { + return mock( PrincipalComponentAnalysisService.class ); + } + + @Bean + public PreprocessorService preprocessorService() { + return mock( PreprocessorService.class ); + } + + @Bean + public QuantitationTypeService qtService() { + return mock( QuantitationTypeService.class ); + } + + @Bean + public RawExpressionDataVectorService rawExpressionDataVectorService() { + return mock( RawExpressionDataVectorService.class ); + } + + @Bean + public SampleCoexpressionAnalysisService sampleCorService() { + return mock( SampleCoexpressionAnalysisService.class ); + } + + @Bean + public VectorMergingService vectorMergingService() { + return mock( VectorMergingService.class ); + } + + @Bean + public RawAndProcessedExpressionDataVectorService rawAndProcessedExpressionDataVectorService() { + return mock( RawAndProcessedExpressionDataVectorService.class ); + } + } + + @Autowired + private DataUpdater dataUpdater; + + @Autowired + private PreprocessorService preprocessorService; + + @Autowired + private ExpressionExperimentService expressionExperimentService; + + @Autowired + private ArrayDesignService arrayDesignService; + + @Autowired + private BioAssayDimensionService bioAssayDimensionService; + + private ExpressionExperiment ee; + private ArrayDesign ad; + private DoubleMatrix rpkmMatrix, countMatrix; + + @Before + public void setUp() throws IOException { + rpkmMatrix = new DoubleMatrixReader().read( getClass().getResourceAsStream( "/data/loader/expression/flatfileload/GSE19166_expression_count.test.txt" ) ); + countMatrix = new DoubleMatrixReader().read( getClass().getResourceAsStream( "/data/loader/expression/flatfileload/GSE19166_expression_RPKM.test.txt" ) ); + ee = new ExpressionExperiment(); + for ( String col : countMatrix.getColNames() ) { + BioAssay ba = BioAssay.Factory.newInstance( col ); + BioMaterial bm = BioMaterial.Factory.newInstance( col ); + bm.setBioAssaysUsedIn( Collections.singleton( ba ) ); + ba.setSampleUsed( bm ); + ee.getBioAssays().add( ba ); + } + ad = new ArrayDesign(); + ad.setId( 1L ); + for ( String row : countMatrix.getRowNames() ) { + ad.getCompositeSequences().add( CompositeSequence.Factory.newInstance( row, ad ) ); + } + } + + @Test + public void testSkipRecomputeLog2cpm() { + when( expressionExperimentService.thawLite( ee ) ).thenReturn( ee ); + when( expressionExperimentService.addRawVectors( same( ee ), any() ) ).thenReturn( ee ); + when( expressionExperimentService.replaceRawVectors( same( ee ), any() ) ).thenReturn( ee ); + when( arrayDesignService.thaw( ad ) ).thenReturn( ad ); + when( expressionExperimentService.getArrayDesignsUsed( ee ) ).thenReturn( Collections.singleton( ad ) ); + when( bioAssayDimensionService.findOrCreate( any() ) ).thenAnswer( a -> a.getArgument( 0 ) ); + dataUpdater.addCountData( ee, ad, countMatrix, rpkmMatrix, 31, false, false, false ); + verify( preprocessorService ).process( ee ); + verify( expressionExperimentService, times( 2 ) ).addRawVectors( same( ee ), any() ); + dataUpdater.addCountData( ee, ad, countMatrix, rpkmMatrix, 30, false, false, true ); + verify( expressionExperimentService ).replaceRawVectors( same( ee ), any() ); + verifyNoMoreInteractions( preprocessorService ); + } +} \ No newline at end of file