From 5205963612213971178e907be474b7390ba28139 Mon Sep 17 00:00:00 2001
From: Paul Pavlidis <paul@msl.ubc.ca>
Date: Mon, 17 Jul 2023 11:06:12 -0700
Subject: [PATCH 1/8] allow backfilling of counts and rpkm without triggering
 preprocessing

---
 .../gemma/core/apps/RNASeqDataAddCli.java     | 14 +++-
 .../core/loader/expression/DataUpdater.java   | 18 +++-
 .../loader/expression/DataUpdaterImpl.java    | 82 +++++++++----------
 .../analysis/expression/diff/DiffExTest.java  |  2 +-
 .../preprocess/MeanVarianceServiceTest.java   |  5 +-
 .../loader/expression/DataUpdaterTest.java    |  8 +-
 6 files changed, 76 insertions(+), 53 deletions(-)

diff --git a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
index 88ccc44d23..8f8c3b2af1 100644
--- a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
+++ b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
@@ -34,7 +34,6 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
 import java.text.MessageFormat;
 import java.util.Collection;
@@ -59,6 +58,7 @@ public class RNASeqDataAddCli extends ExpressionExperimentManipulatingCLI {
     private String rpkmFile = null;
     private boolean justbackfillLog2cpm = false;
     private File qualityControlReportFile = null;
+    private boolean justbackfillCountsAndRPKM = false;
 
     @Override
     public CommandGroup getCommandGroup() {
@@ -81,6 +81,8 @@ protected void buildOptions( Options options ) {
 
         // RNA-Seq pipeline QC report
         options.addOption( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT, true, "File containing a MultiQC report" );
+
+        options.addOption( Option.builder( "noLog2cpm" ).longOpt( null ).desc( "Skip log2cpm computation and postprocessing, only backfill counts and rpkm" ).build() );
     }
 
     @Override
@@ -133,12 +135,20 @@ protected void processOptions( CommandLine commandLine ) {
         if ( rpkmFile == null && countFile == null )
             throw new IllegalArgumentException( "Must provide either RPKM or count data (or both)" );
 
+
         if ( !commandLine.hasOption( "a" ) ) {
             throw new IllegalArgumentException( "Must provide target platform" );
         }
 
         this.platformName = commandLine.getOptionValue( "a" );
 
+        if (commandLine.hasOption( "noLog2cpm")) {
+            this.justbackfillCountsAndRPKM = true;
+            if (this.justbackfillLog2cpm) {
+                throw new IllegalArgumentException("Can't use both noLog2cpm and log2cpm options");
+            }
+        }
+
         if ( commandLine.hasOption( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT ) ) {
             qualityControlReportFile = new File( commandLine.getOptionValue( RNASeqDataAddCli.MULTIQC_METADATA_FILE_OPT ) );
             if ( !qualityControlReportFile.isFile() || !qualityControlReportFile.canRead() ) {
@@ -208,7 +218,7 @@ protected void doWork() throws Exception {
             }
 
             serv.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, readLength, isPairedReads,
-                    allowMissingSamples );
+                    allowMissingSamples, justbackfillCountsAndRPKM );
 
         } catch ( IOException e ) {
             throw new Exception( "Failed while processing " + ee, e );
diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java
index 8a760d58f4..752ab8955b 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdater.java
@@ -11,9 +11,25 @@
 public interface DataUpdater {
     void addAffyDataFromAPTOutput( ExpressionExperiment ee, String pathToAptOutputFile ) throws IOException;
 
+    /**
+     * RNA-seq: Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation
+     * type we use internally. Counts and FPKM (if provided) are stored in addition.
+     *
+     * Rows (genes) that have all zero counts are ignored entirely.
+     *
+     * @param ee                  ee
+     * @param targetArrayDesign   - this should be one of the "Generic" gene-based platforms. The data set will be
+     *                            switched to use it.
+     * @param countMatrix         Representing 'raw' counts (added after rpkm, if provided).
+     * @param rpkmMatrix          Representing per-gene normalized data, optional (RPKM or FPKM)
+     * @param readLength          read length
+     * @param isPairedReads       is paired reads
+     * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
+     * @param skipLog2cpm         Only load the counts (and RPKM/FPKM if provided) - implemented to allow backfilling without updating log2cpm and preproc.
+     */
     void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign,
             DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength,
-            Boolean isPairedReads, boolean allowMissingSamples );
+            Boolean isPairedReads, boolean allowMissingSamples, boolean skipLog2cpm );
 
     void log2cpmFromCounts( ExpressionExperiment ee, QuantitationType qt );
 
diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
index 36692411e4..24c06e332d 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
@@ -178,26 +178,11 @@ public void addAffyDataFromAPTOutput( ExpressionExperiment ee, String pathToAptO
 
     }
 
-    /**
-     * RNA-seq: Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation
-     * type we use internally. Counts and FPKM (if provided) are stored in addition.
-     *
-     * Rows (genes) that have all zero counts are ignored entirely.
-     *
-     * @param ee                  ee
-     * @param targetArrayDesign   - this should be one of the "Generic" gene-based platforms. The data set will be
-     *                            switched to use it.
-     * @param countMatrix         Representing 'raw' counts (added after rpkm, if provided).
-     * @param rpkmMatrix          Representing per-gene normalized data, optional (RPKM or FPKM)
-     * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
-     * @param isPairedReads       is paired reads
-     * @param readLength          read length
-     */
     @Override
     @Transactional(propagation = Propagation.NEVER)
     public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign,
             DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength,
-            Boolean isPairedReads, boolean allowMissingSamples ) {
+            Boolean isPairedReads, boolean allowMissingSamples, boolean skipLog2cpm ) {
 
         if ( countMatrix == null )
             throw new IllegalArgumentException( "You must provide count matrix (rpkm is optional)" );
@@ -215,6 +200,7 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
         }
 
         ee = experimentService.thawLite( ee );
+        Collection<QuantitationType> oldQts = ee.getQuantitationTypes();
 
         ee = this.dealWithMissingSamples( ee, countMatrix, allowMissingSamples );
 
@@ -225,27 +211,8 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
         assert !properCountMatrix.getColNames().isEmpty();
         assert !properCountMatrix.getRowNames().isEmpty();
 
-        Collection<QuantitationType> oldQts = ee.getQuantitationTypes();
-
         //    countEEMatrix = this.removeNoDataRows( countEEMatrix );
 
-        QuantitationType log2cpmQt = this.makelog2cpmQt();
-        for ( QuantitationType oldqt : oldQts ) { // use old QT if possible
-            if ( oldqt.getName().equals( log2cpmQt.getName() ) ) {
-                log2cpmQt = oldqt;
-                break;
-            }
-        }
-
-        DoubleMatrix1D librarySize = MatrixStats.colSums( countMatrix );
-        DoubleMatrix<CompositeSequence, BioMaterial> log2cpmMatrix = MatrixStats
-                .convertToLog2Cpm( properCountMatrix, librarySize );
-
-        ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix( ee, log2cpmQt, log2cpmMatrix );
-
-        // important: replaceData takes care of the platform switch if necessary; call first. It also deletes old QTs, so from here we have to remake them.
-        ee = this.replaceData( ee, targetArrayDesign, log2cpmEEMatrix );
-
         QuantitationType countqt = this.makeCountQt();
         for ( QuantitationType oldqt : oldQts ) { // use old QT if possible
             if ( oldqt.getName().equals( countqt.getName() ) ) {
@@ -254,11 +221,11 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
             }
         }
         ExpressionDataDoubleMatrix countEEMatrix = new ExpressionDataDoubleMatrix( ee, countqt, properCountMatrix );
-
         ee = this.addData( ee, targetArrayDesign, countEEMatrix );
-
         this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads );
 
+
+        /* add rpkm/fpkm data if available */
         if ( rpkmMatrix != null ) {
 
             DoubleMatrix<CompositeSequence, BioMaterial> properRPKMMatrix = this
@@ -281,6 +248,31 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
             this.addData( ee, targetArrayDesign, rpkmEEMatrix );
         }
 
+        // offered only to allow us to backfill the count and rpkm data without updating log2cpm and triggering postprocessing
+        if ( skipLog2cpm ) {
+            log.info( "Done adding count data, skipping the rest as per selected options" );
+            return;
+        }
+
+        /* Add the log2cpm data.*/
+        QuantitationType log2cpmQt = this.makelog2cpmQt();
+        for ( QuantitationType oldqt : oldQts ) { // use old QT if possible
+            if ( oldqt.getName().equals( log2cpmQt.getName() ) ) {
+                log2cpmQt = oldqt;
+                break;
+            }
+        }
+
+        DoubleMatrix1D librarySize = MatrixStats.colSums( countMatrix );
+        DoubleMatrix<CompositeSequence, BioMaterial> log2cpmMatrix = MatrixStats
+                .convertToLog2Cpm( properCountMatrix, librarySize );
+
+        ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix( ee, log2cpmQt, log2cpmMatrix );
+
+        // important: replaceData takes care of the platform switch if necessary; call first. It also deletes old QTs, so from here we have to remake them.
+        ee = this.replaceData( ee, targetArrayDesign, log2cpmEEMatrix );
+
+
     }
 
     /**
@@ -677,9 +669,9 @@ public ExpressionExperiment replaceData( ExpressionExperiment ee, ArrayDesign ta
      * RNA-seq
      *
      * @param ee            experiment
-     * @param countEEMatrix count ee matrix
-     * @param readLength    read length
-     * @param isPairedReads is paired reads
+     * @param countEEMatrix count ee matrix; used to compute library sizes
+     * @param readLength    read length (optional)
+     * @param isPairedReads is paired reads (optional)
      */
     private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDoubleMatrix countEEMatrix,
             Integer readLength, Boolean isPairedReads ) {
@@ -693,8 +685,14 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo
             }
             DataUpdaterImpl.log.info( ba + " total library size=" + librarySize );
 
-            ba.setSequenceReadLength( readLength );
-            ba.setSequencePairedReads( isPairedReads );
+            if ( readLength != null ) {
+                ba.setSequenceReadLength( readLength );
+            }
+
+            if ( isPairedReads != null ) {
+                ba.setSequencePairedReads( isPairedReads );
+            }
+
             ba.setSequenceReadCount( librarySize );
 
             bioAssayService.update( ba );
diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java
index 86f9691e77..937323fc94 100644
--- a/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java
+++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/expression/diff/DiffExTest.java
@@ -146,7 +146,7 @@ public void testCountData() throws Exception {
 
             // the experiment has 8 samples but the data has 4 columns so allow missing samples
             // GSM718707 GSM718708 GSM718709 GSM718710
-            dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, null, 36, true, true );
+            dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, null, 36, true, true, false );
         }
 
         // make sure to do a thawRawAndProcessed() to get the addCountData() updates
diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java
index e6f615e933..5507b9d6bc 100644
--- a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java
+++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/MeanVarianceServiceTest.java
@@ -33,7 +33,6 @@
 import ubic.gemma.core.security.authorization.acl.AclTestUtils;
 import ubic.gemma.core.util.test.category.GeoTest;
 import ubic.gemma.core.util.test.category.SlowTest;
-import ubic.gemma.model.common.auditAndSecurity.AuditAction;
 import ubic.gemma.model.common.quantitationtype.*;
 import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
 import ubic.gemma.model.expression.arrayDesign.TechnologyType;
@@ -235,12 +234,12 @@ final public void testServiceCreateCountData() throws Exception {
             targetArrayDesign = arrayDesignService.thaw( targetArrayDesign );
 
             try {
-                dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false );
+                dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false );
                 fail( "Should have gotten an exception" );
             } catch ( IllegalArgumentException e ) {
                 // Expected
             }
-            dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true );
+            dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true, false );
         }
 
         ee = eeService.thaw( this.ee );
diff --git a/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java
index bd183ba894..8ee5f15221 100644
--- a/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java
+++ b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdaterTest.java
@@ -241,7 +241,7 @@ public void testLoadRNASeqData() throws Exception {
         assertEquals( 199, targetArrayDesign.getCompositeSequences().size() );
 
         // Main step.
-        dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false );
+        dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false );
         ee = experimentService.loadOrFail( ee.getId() );
         ee = experimentService.thaw( ee );
 
@@ -285,7 +285,7 @@ public void testLoadRNASeqData() throws Exception {
         assertFalse( dataVectorService.getProcessedDataVectors( ee2 ).isEmpty() );
 
         // Call it again to test that we don't leak QTs
-        dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false );
+        dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false );
         ee = experimentService.load( ee.getId() );
         assertNotNull( ee );
         ee = this.experimentService.thawLite( ee );
@@ -328,12 +328,12 @@ public void testLoadRNASeqDataWithMissingSamples() throws Exception {
                     .getTestPersistentArrayDesign( probeNames, taxonService.findByCommonName( "human" ) );
             targetArrayDesign = arrayDesignService.thaw( targetArrayDesign );
             try {
-                dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false );
+                dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false, false );
                 fail( "Should have gotten an exception" );
             } catch ( IllegalArgumentException e ) {
                 // Expected
             }
-            dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true );
+            dataUpdater.addCountData( ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, true, false );
         }
 
         /*

From 36bf8df2a6db8c841f6e793c6e881c0d55a5958a Mon Sep 17 00:00:00 2001
From: Paul Pavlidis <paul@msl.ubc.ca>
Date: Mon, 17 Jul 2023 11:13:17 -0700
Subject: [PATCH 2/8] add a check that the library size is not changed when
 adding counts to data that already has librarysizes

---
 .../ubic/gemma/core/loader/expression/DataUpdaterImpl.java     | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
index 24c06e332d..ed105e8c38 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
@@ -693,6 +693,9 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo
                 ba.setSequencePairedReads( isPairedReads );
             }
 
+            if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize ) {
+                throw new IllegalStateException( "Read count was already set for " + ba + " and is not the same (existing=" + ba.getSequenceReadCount() + " vs updated=" + librarySize + ")" );
+            }
             ba.setSequenceReadCount( librarySize );
 
             bioAssayService.update( ba );

From 2dd9ae120d46596656202c59e66236e0677a3b9c Mon Sep 17 00:00:00 2001
From: Paul Pavlidis <paul@msl.ubc.ca>
Date: Mon, 17 Jul 2023 11:18:02 -0700
Subject: [PATCH 3/8] add check for whether already stored library sizes can
 deviate from the new

---
 .../gemma/core/loader/expression/DataUpdaterImpl.java    | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
index ed105e8c38..731b35713f 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/DataUpdaterImpl.java
@@ -222,7 +222,7 @@ public void addCountData( ExpressionExperiment ee, ArrayDesign targetArrayDesign
         }
         ExpressionDataDoubleMatrix countEEMatrix = new ExpressionDataDoubleMatrix( ee, countqt, properCountMatrix );
         ee = this.addData( ee, targetArrayDesign, countEEMatrix );
-        this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads );
+        this.addTotalCountInformation( ee, countEEMatrix, readLength, isPairedReads, skipLog2cpm );
 
 
         /* add rpkm/fpkm data if available */
@@ -672,9 +672,12 @@ public ExpressionExperiment replaceData( ExpressionExperiment ee, ArrayDesign ta
      * @param countEEMatrix count ee matrix; used to compute library sizes
      * @param readLength    read length (optional)
      * @param isPairedReads is paired reads (optional)
+     * @param requireExistingLibrarySizesMatch   whether to care if existing information about librarysizes can conflict with the one stored.
+     *                                           This should be true when simply backfilling counts, as a check.
+     *                                           It should be false if we are actually trying to replace existing counts.
      */
     private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDoubleMatrix countEEMatrix,
-            Integer readLength, Boolean isPairedReads ) {
+            Integer readLength, Boolean isPairedReads, boolean requireExistingLibrarySizesMatch ) {
         for ( BioAssay ba : ee.getBioAssays() ) {
             Double[] col = countEEMatrix.getColumn( ba );
             int librarySize = ( int ) Math.floor( DescriptiveWithMissing.sum( new DoubleArrayList( ArrayUtils.toPrimitive( col ) ) ) );
@@ -693,7 +696,7 @@ private void addTotalCountInformation( ExpressionExperiment ee, ExpressionDataDo
                 ba.setSequencePairedReads( isPairedReads );
             }
 
-            if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize ) {
+            if ( ba.getSequenceReadCount() != null && ba.getSequenceReadCount() != librarySize && requireExistingLibrarySizesMatch ) {
                 throw new IllegalStateException( "Read count was already set for " + ba + " and is not the same (existing=" + ba.getSequenceReadCount() + " vs updated=" + librarySize + ")" );
             }
             ba.setSequenceReadCount( librarySize );

From d124d35de3d65324e2dda3a47e740f267752f0da Mon Sep 17 00:00:00 2001
From: Guillaume Poirier-Morency <poirigui@msl.ubc.ca>
Date: Mon, 24 Jul 2023 15:56:59 -0700
Subject: [PATCH 4/8] Use null as default for missing pairedness information

---
 .../src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
index 8f8c3b2af1..55d2409bc7 100644
--- a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
+++ b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
@@ -52,7 +52,7 @@ public class RNASeqDataAddCli extends ExpressionExperimentManipulatingCLI {
     private static final String MULTIQC_METADATA_FILE_OPT = "multiqc";
     private boolean allowMissingSamples = false;
     private String countFile = null;
-    private boolean isPairedReads = false;
+    private Boolean isPairedReads = null;
     private String platformName = null;
     private Integer readLength = null;
     private String rpkmFile = null;

From ea1f70357f7335904712563a57584933dd9e02a2 Mon Sep 17 00:00:00 2001
From: Guillaume Poirier-Morency <poirigui@msl.ubc.ca>
Date: Mon, 24 Jul 2023 15:55:35 -0700
Subject: [PATCH 5/8] Add tests for the -noLog2cpm switch

---
 .../gemma/core/apps/RNASeqDataAddCliTest.java | 112 ++++++++++++++++++
 .../src/test/resources/GSE00001.counts.txt    |   2 +
 .../src/test/resources/GSE00001.rpkm.txt      |   2 +
 3 files changed, 116 insertions(+)
 create mode 100644 gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java
 create mode 100644 gemma-cli/src/test/resources/GSE00001.counts.txt
 create mode 100644 gemma-cli/src/test/resources/GSE00001.rpkm.txt

diff --git a/gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java b/gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java
new file mode 100644
index 0000000000..da730c3140
--- /dev/null
+++ b/gemma-cli/src/test/java/ubic/gemma/core/apps/RNASeqDataAddCliTest.java
@@ -0,0 +1,112 @@
+package ubic.gemma.core.apps;
+
+import org.junit.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.security.test.context.support.WithMockUser;
+import org.springframework.test.annotation.DirtiesContext;
+import org.springframework.test.context.ContextConfiguration;
+import ubic.gemma.core.analysis.service.ExpressionDataFileService;
+import ubic.gemma.core.genome.gene.service.GeneService;
+import ubic.gemma.core.loader.expression.DataUpdater;
+import ubic.gemma.core.search.SearchService;
+import ubic.gemma.core.util.AbstractCLI;
+import ubic.gemma.core.util.test.BaseCliTest;
+import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
+import ubic.gemma.model.expression.experiment.ExpressionExperiment;
+import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
+import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
+import ubic.gemma.persistence.service.genome.taxon.TaxonService;
+import ubic.gemma.persistence.util.TestComponent;
+
+import static org.junit.Assert.assertEquals;
+import static org.mockito.Mockito.*;
+
+@DirtiesContext(classMode = DirtiesContext.ClassMode.AFTER_EACH_TEST_METHOD)
+@ContextConfiguration
+public class RNASeqDataAddCliTest extends BaseCliTest {
+
+    @TestComponent
+    @Configuration
+    static class RNASeqDataAddCliTestContextConfiguration extends BaseCliTestContextConfiguration {
+
+        @Bean
+        public RNASeqDataAddCli rnaSeqDataAddCli() {
+            return new RNASeqDataAddCli();
+        }
+
+        @Bean
+        public DataUpdater dataUpdater() {
+            return mock( DataUpdater.class );
+        }
+
+        @Bean
+        public GeneService geneService() {
+            return mock( GeneService.class );
+        }
+
+        @Bean
+        public TaxonService taxonService() {
+            return mock( TaxonService.class );
+        }
+
+        @Bean
+        public SearchService searchService() {
+            return mock( SearchService.class );
+        }
+
+        @Bean
+        public ExpressionDataFileService expressionDataFileService() {
+            return mock( ExpressionDataFileService.class );
+        }
+
+        @Bean
+        public ArrayDesignService arrayDesignService() {
+            return mock( ArrayDesignService.class );
+        }
+    }
+
+    @Autowired
+    private RNASeqDataAddCli cli;
+
+    @Autowired
+    private DataUpdater dataUpdater;
+
+    @Autowired
+    private ExpressionExperimentService expressionExperimentService;
+
+    @Autowired
+    private ArrayDesignService arrayDesignService;
+
+    @Test
+    @WithMockUser
+    public void test() {
+        ExpressionExperiment ee = new ExpressionExperiment();
+        ArrayDesign ad = new ArrayDesign();
+        when( expressionExperimentService.findByShortName( "GSE00001" ) ).thenReturn( ee );
+        when( arrayDesignService.findByShortName( "Generic_human_ncbiIds" ) ).thenReturn( ad );
+        assertEquals( AbstractCLI.SUCCESS, cli.executeCommand( new String[] {
+                "-e", "GSE00001", "-a", "Generic_human_ncbiIds",
+                "-rpkm", getClass().getResource( "/GSE00001.rpkm.txt" ).getFile(),
+                "-count", getClass().getResource( "/GSE00001.counts.txt" ).getFile() } ) );
+        verify( dataUpdater ).addCountData( same( ee ), same( ad ), isNotNull(), isNotNull(), isNull(),
+                isNull(), eq( false ), eq( false ) );
+    }
+
+    @Test
+    @WithMockUser
+    public void testSkipLog2cpm() {
+        ExpressionExperiment ee = new ExpressionExperiment();
+        ArrayDesign ad = new ArrayDesign();
+        when( expressionExperimentService.findByShortName( "GSE00001" ) ).thenReturn( ee );
+        when( arrayDesignService.findByShortName( "Generic_human_ncbiIds" ) ).thenReturn( ad );
+        assertEquals( AbstractCLI.SUCCESS, cli.executeCommand( new String[] {
+                "-e", "GSE00001", "-a", "Generic_human_ncbiIds",
+                "-rpkm", getClass().getResource( "/GSE00001.rpkm.txt" ).getFile(),
+                "-count", getClass().getResource( "/GSE00001.counts.txt" ).getFile(),
+                "-noLog2cpm" } ) );
+        verify( dataUpdater ).addCountData( same( ee ), same( ad ), isNotNull(), isNotNull(), isNull(),
+                isNull(), eq( false ), eq( true ) );
+    }
+}
\ No newline at end of file
diff --git a/gemma-cli/src/test/resources/GSE00001.counts.txt b/gemma-cli/src/test/resources/GSE00001.counts.txt
new file mode 100644
index 0000000000..dc8d95a79d
--- /dev/null
+++ b/gemma-cli/src/test/resources/GSE00001.counts.txt
@@ -0,0 +1,2 @@
+GSM000001
+1
\ No newline at end of file
diff --git a/gemma-cli/src/test/resources/GSE00001.rpkm.txt b/gemma-cli/src/test/resources/GSE00001.rpkm.txt
new file mode 100644
index 0000000000..acde930d36
--- /dev/null
+++ b/gemma-cli/src/test/resources/GSE00001.rpkm.txt
@@ -0,0 +1,2 @@
+GSM000001
+0.5
\ No newline at end of file

From a46815a942859f6bf2fd03966d89e6860f103c60 Mon Sep 17 00:00:00 2001
From: Guillaume Poirier-Morency <poirigui@msl.ubc.ca>
Date: Mon, 24 Jul 2023 20:25:02 -0700
Subject: [PATCH 6/8] Update CLI docs and arg logic that omitting pairedness
 result in undefined pairedness

---
 .../java/ubic/gemma/core/apps/RNASeqDataAddCli.java    | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
index 55d2409bc7..d87f22280d 100644
--- a/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
+++ b/gemma-cli/src/main/java/ubic/gemma/core/apps/RNASeqDataAddCli.java
@@ -75,7 +75,7 @@ protected void buildOptions( Options options ) {
         options.addOption( RNASeqDataAddCli.ALLOW_MISSING, "Set this if your data files don't have information for all samples." );
         options.addOption( Option.builder( "a" ).longOpt( null ).desc( "Target platform (must already exist in the system)" ).argName( "platform short name" ).hasArg().build() );
 
-        options.addOption( Option.builder( RNASeqDataAddCli.METADATAOPT ).longOpt( null ).desc( "Information on read length given as a string like '100:paired', '36 (assumed unpaired)', or '36:unpaired' " ).argName( "length" ).hasArg().build() );
+        options.addOption( Option.builder( RNASeqDataAddCli.METADATAOPT ).longOpt( null ).desc( "Information on read length given as a string like '100:paired', '36' (no pairedness assumed), or '36:unpaired' " ).argName( "length" ).hasArg().build() );
 
         options.addOption( "log2cpm", "Just compute log2cpm from the existing stored count data (backfill); batchmode OK, no other options needed" );
 
@@ -126,6 +126,8 @@ protected void processOptions( CommandLine commandLine ) {
                 } else {
                     throw new IllegalArgumentException( "Value must be either 'paired' or 'unpaired' or left blank" );
                 }
+            } else {
+                this.isPairedReads = null;
             }
 
         }
@@ -142,10 +144,10 @@ protected void processOptions( CommandLine commandLine ) {
 
         this.platformName = commandLine.getOptionValue( "a" );
 
-        if (commandLine.hasOption( "noLog2cpm")) {
+        if ( commandLine.hasOption( "noLog2cpm" ) ) {
             this.justbackfillCountsAndRPKM = true;
-            if (this.justbackfillLog2cpm) {
-                throw new IllegalArgumentException("Can't use both noLog2cpm and log2cpm options");
+            if ( this.justbackfillLog2cpm ) {
+                throw new IllegalArgumentException( "Can't use both noLog2cpm and log2cpm options" );
             }
         }
 

From d08628a05eb11a02264394ab61e739b893d9fcc0 Mon Sep 17 00:00:00 2001
From: Guillaume Poirier-Morency <poirigui@msl.ubc.ca>
Date: Mon, 24 Jul 2023 21:00:46 -0700
Subject: [PATCH 7/8] Extract interface from
 ExpressionExperimentPlatformSwitchService

This is needed to write unit tests for DataUpdater.
---
 ...essionExperimentPlatformSwitchService.java | 649 +----------------
 ...onExperimentPlatformSwitchServiceImpl.java | 652 ++++++++++++++++++
 .../ArrayDesignMergeServiceImpl.java          |   5 +-
 3 files changed, 658 insertions(+), 648 deletions(-)
 create mode 100644 gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java

diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java
index d892b21ad9..b54789d89c 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchService.java
@@ -1,655 +1,12 @@
-/*
- * The Gemma project
- *
- * Copyright (c) 2007 University of British Columbia
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
 package ubic.gemma.core.loader.expression;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.stereotype.Component;
-
-import ubic.gemma.core.analysis.expression.AnalysisUtilService;
-import ubic.gemma.core.analysis.preprocess.VectorMergingService;
-import ubic.gemma.core.analysis.service.ExpressionExperimentVectorManipulatingService;
-import ubic.gemma.model.common.quantitationtype.PrimitiveType;
-import ubic.gemma.model.common.quantitationtype.QuantitationType;
 import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
-import ubic.gemma.model.expression.arrayDesign.TechnologyType;
-import ubic.gemma.model.expression.bioAssay.BioAssay;
-import ubic.gemma.model.expression.bioAssayData.BioAssayDimension;
-import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector;
-import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector;
-import ubic.gemma.model.expression.biomaterial.BioMaterial;
-import ubic.gemma.model.expression.designElement.CompositeSequence;
 import ubic.gemma.model.expression.experiment.ExpressionExperiment;
-import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet;
 import ubic.gemma.model.genome.biosequence.BioSequence;
-import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService;
-import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
-import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService;
-import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService;
-import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
-import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSubSetService;
-
-/**
- * Switch an expression experiment from one array design to another. This is valuable when the EE uses more than on AD,
- * and a merged AD exists. The following steps are needed:
- * <ul>
- *
- * <li>Delete old analyses of this experiment and processeddata vectors
- * <li>For each array design, for each probe, identify the matching probe on the merged AD. Have to deal with situation
- * <li>where more than one occurrence of each sequence is found.
- * <li>all DEDVs must be switched to use the new AD's design elements
- * <li>all bioassays must be switched to the new AD.
- * <li>update the EE description
- * <li>commit changes.
- * <li>Computed processed data vectors
- * </ul>
- * This also handles the case of multisamples-per-platform - NOT the case of one-sample-per-platform but multiple
- * platforms; for that you have to run VectorMerging. For more nutty situations this will probably create a mess.
- *
- * @author pavlidis
- * @see    VectorMergingService
- */
-@Component
-public class ExpressionExperimentPlatformSwitchService extends ExpressionExperimentVectorManipulatingService {
-
-    /**
-     * Used to identify design elements that have no sequence associated with them.
-     */
-    public static final BioSequence NULL_BIOSEQUENCE;
-    private static final Log log = LogFactory.getLog( ExpressionExperimentPlatformSwitchService.class.getName() );
-
-    static {
-        NULL_BIOSEQUENCE = BioSequence.Factory.newInstance();
-        ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE.setName( "______NULL______" );
-        ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE.setId( -1L );
-    }
-
-    @Autowired
-    private ArrayDesignService arrayDesignService;
-
-    @Autowired
-    private BioAssayDimensionService bioAssayDimensionService;
-
-    @Autowired
-    private BioAssayService bioAssayService;
-
-    @Autowired
-    private ExpressionExperimentService expressionExperimentService;
-
-    @Autowired
-    private ExperimentPlatformSwitchHelperService helperService;
-
-    @Autowired
-    private SampleCoexpressionAnalysisService sampleCoexpressionAnalysisService;
-
-    @Autowired
-    private ExpressionExperimentSubSetService subsetService;
-
-    @Autowired
-    private AnalysisUtilService analysisUtilService;
-
-    /**
-     * If you know the array designs are already in a merged state, you should use switchExperimentToMergedPlatform
-     *
-     * @param  ee          ee
-     * @param  arrayDesign The array design to switch to. If some samples already use that array design, nothing will be
-     *                     changed for them.
-     * @return the switched experiment
-     */
-    public void switchExperimentToArrayDesign( ExpressionExperiment ee, ArrayDesign arrayDesign ) {
-        assert arrayDesign != null;
-
-        // remove stuff that will be in the way.
-        processedExpressionDataVectorService.removeProcessedDataVectors( ee );
-        sampleCoexpressionAnalysisService.removeForExperiment( ee );
-        for ( ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets( ee ) ) {
-            subsetService.remove( subset );
-        }
-        analysisUtilService.deleteOldAnalyses( ee );
-
-        // get relation between sequence and designelements.
-        Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>();
-        Collection<CompositeSequence> elsWithNoSeq = new HashSet<>();
-        this.populateCSeq( arrayDesign, designElementMap, elsWithNoSeq );
-
-        ee = expressionExperimentService.thaw( ee );
-
-        if ( elsWithNoSeq.size() > 0 ) {
-            ExpressionExperimentPlatformSwitchService.log
-                    .info( elsWithNoSeq.size() + " elements on the new platform have no associated sequence." );
-            designElementMap.put( ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq );
-        }
-
-        boolean multiPlatformPerSample = this.switchPlatform( ee, arrayDesign );
-        /*
-         * For a multiplatform-per-sample case ("Case 2", platforms are disjoint): (note that some samples might just be
-         * on one platform...)
-         * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it).
-         * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors (switchDataForPlatform)
-         * 3. Delete the Bioassays that are using other BADs (cleanupUnused)
-         *
-         * For Case 1 (each sample run on one platform, all platforms are related/similar) Simply switch the vectors
-         * (switchDataForPlatform)
-         */
-
-        /*
-         * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that
-         * includes all BioMaterials.
-         */
-        Collection<BioAssayDimension> unusedBADs = new HashSet<>();
-        BioAssayDimension targetBioAssayDimension = null;
-        if ( multiPlatformPerSample ) {
-            targetBioAssayDimension = this.doMultiSample( ee, unusedBADs );
-        }
-        if ( multiPlatformPerSample && targetBioAssayDimension == null ) {
-            throw new RuntimeException( "Data set cannot be switched to merged platform: no suitable bioassaydimension found" );
-        }
-
-        /*
-         * To account for cases where we have no data loaded yet.
-         */
-        boolean hasData = expressionExperimentService.getQuantitationTypes( ee ).size() > 0 && ee.getRawExpressionDataVectors().size() > 0;
-
-        Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( ee );
-        Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>();
-        int totalVectorsSwitched = 0;
-        for ( ArrayDesign oldAd : oldArrayDesigns ) {
-            totalVectorsSwitched += this.switchDataForPlatform( ee, arrayDesign, designElementMap, targetBioAssayDimension, usedDesignElements,
-                    oldAd );
-        }
-
-        if ( totalVectorsSwitched == 0 && hasData ) {
-            throw new RuntimeException( "No vectors were switched" );
-        }
-
-        String descriptionUpdate = "[Switched to use " + arrayDesign.getShortName() + " by Gemma]";
-        if ( !ee.getDescription().contains( descriptionUpdate ) ) {
-            ee.setDescription( ee.getDescription() + " " + descriptionUpdate );
-        }
-
-        helperService.persist( ee, arrayDesign );
-
-        log.info( "Finishing up" );
-        if ( targetBioAssayDimension != null && !unusedBADs.isEmpty() ) {
-            this.cleanupUnused( unusedBADs, targetBioAssayDimension );
-        }
-
-        ee = expressionExperimentService.loadOrFail( ee.getId() ); // refresh after cleanup
-        ee = expressionExperimentService.thawLite( ee );
-
-        if ( hasData ) {
-            processedExpressionDataVectorService.createProcessedDataVectors( ee ); // this still fails sometimes? works fine if run later by cli
-        }
-    }
-
-    /**
-     * Automatically identify an appropriate merged platform
-     *
-     * @param  expExp ee
-     * @return ee
-     */
-    public void switchExperimentToMergedPlatform( ExpressionExperiment expExp ) {
-        ArrayDesign arrayDesign = this.locateMergedDesign( expExp );
-        if ( arrayDesign == null )
-            throw new IllegalArgumentException( "Experiment has no merged design to switch to" );
-        this.switchExperimentToArrayDesign( expExp, arrayDesign );
-    }
-
-    private boolean switchPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign ) {
-        boolean multiPlatformPerSample = false;
-        for ( BioAssay assay : ee.getBioAssays() ) {
-
-            // Switch the assay to use the desired platform. However, if this is a second switch, don't lose the original value
-            if ( assay.getOriginalPlatform() == null ) {
-                assay.setOriginalPlatform( assay.getArrayDesignUsed() );
-            }
-            assay.setArrayDesignUsed( arrayDesign );
-
-            /*
-             * Side effect: Detect cases of each-sample-run-on-multiple-platforms - we need to merge the bioassays, too.
-             * That means fiddling with the BioAssayDimensions (BADs). We do this check here to avoid unnecessarily
-             * inspecting BADs.
-             */
-            if ( !multiPlatformPerSample && assay.getSampleUsed().getBioAssaysUsedIn().size() > 1 ) {
-                multiPlatformPerSample = true;
-            }
-        }
-        return multiPlatformPerSample;
-    }
-
-    /**
-     * Remove bioassays that are no longer needed
-     *
-     */
-    private void cleanupUnused( Collection<BioAssayDimension> unusedBADs, BioAssayDimension maxBAD ) {
-        ExpressionExperimentPlatformSwitchService.log.info( "Checking for unused BioAssays after merge" );
-
-        Collection<BioAssay> removed = new HashSet<>();
-        for ( BioAssayDimension bioAssayDimension : unusedBADs ) {
-            List<BioAssay> bioAssays = bioAssayDimension.getBioAssays();
-            for ( BioAssay ba : bioAssays ) {
-                if ( !maxBAD.getBioAssays().contains( ba ) && !removed.contains( ba ) ) {
-                    try {
-                        ExpressionExperimentPlatformSwitchService.log.info( "Deleting unused bioassay: " + ba );
-                        bioAssayService.remove( ba );
-                        removed.add( ba );
-                    } catch ( Exception e ) {
-                        log.error( "Failed to delete: " + ba, e );
-                    }
-                }
-            }
-            try {
-                bioAssayDimensionService.remove( bioAssayDimension );
-                log.info( "Removed unused bioAssayDimension ID=" + bioAssayDimension.getId() );
-            } catch ( Exception e ) {
-                log.warn( "Failed to clean up unused bioassaydimension with ID=" + bioAssayDimension.getId() + ": " + e
-                        .getMessage() );
-            }
-        }
-        log.info( "Removed " + removed.size() + " usused bioassays" );
-    }
-
-    /**
-     * Find the bioassaydimension that covers all the biomaterials.
-     *
-     */
-    private BioAssayDimension doMultiSample( ExpressionExperiment ee, Collection<BioAssayDimension> unusedBADs ) {
-
-        int maxSize = 0;
-        BioAssayDimension maxBAD = null;
-        for ( BioAssay ba : ee.getBioAssays() ) {
-            Collection<BioAssayDimension> oldBioAssayDims = bioAssayService.findBioAssayDimensions( ba );
-            for ( BioAssayDimension bioAssayDim : oldBioAssayDims ) {
-                unusedBADs.add( bioAssayDim );
-                int size = bioAssayDim.getBioAssays().size();
-
-                if ( size > maxSize ) {
-                    maxSize = size;
-                    maxBAD = bioAssayDim;
-                }
-            }
-        }
-
-        assert unusedBADs.size() > 1; // otherwise we shouldn't be here.
-        unusedBADs.remove( maxBAD );
-
-        /*
-         * Make sure all biomaterials in the study are included in the chosen bioassaydimension. If not, we'd have
-         * to make a new BAD.
-         */
-        if ( maxBAD != null ) {
-            Collection<BioMaterial> bmsInmaxBAD = new HashSet<>();
-            for ( BioAssay ba : maxBAD.getBioAssays() ) {
-                bmsInmaxBAD.add( ba.getSampleUsed() );
-            }
-
-            for ( BioAssay ba : ee.getBioAssays() ) {
-                if ( !bmsInmaxBAD.contains( ba.getSampleUsed() ) ) {
-
-                    ExpressionExperimentPlatformSwitchService.log
-                            .debug( "This experiment looked like it had samples run on more than one platform, "
-                                    + "but it also has no BioAssayDimension that is eligible to accomodate all samples (Example: "
-                                    + ba.getSampleUsed() );
-                    maxBAD = null;
-                    break;
-                }
-            }
-
-        }
-
-        if ( maxBAD == null ) {
-            log.info( "Creating new bioassaydimension to accomodate merged data as no existing one was suitable" );
-            maxBAD = BioAssayDimension.Factory.newInstance( "For " + ee.getBioAssays().size() +
-                    " bioMaterials", "Created to accomodate platform switch", new ArrayList<>( ee.getBioAssays() ) );
-            maxBAD = bioAssayDimensionService.create( maxBAD );
-        }
-
-
-        return maxBAD;
-    }
-
-    /**
-     * Determine whether the two bioassaydimensions are the same, based on the samples used. Note it is inefficient to
-     * call this over and over but it's not a big deal so far.
-     *
-     * @param  currentOrder current order
-     * @param  desiredOrder desired order
-     * @return boolean
-     */
-    private boolean equivalent( List<BioAssay> currentOrder, List<BioAssay> desiredOrder ) {
-        if ( currentOrder.size() != desiredOrder.size() ) {
-            return false;
-        }
-
-        for ( int i = 0; i < currentOrder.size(); i++ ) {
-            if ( !currentOrder.get( i ).getSampleUsed().equals( desiredOrder.get( i ).getSampleUsed() ) ) {
-                return false;
-            }
-        }
-
-        return true;
-
-    }
-
-    private ArrayDesign locateMergedDesign( ExpressionExperiment expExp ) {
-        // get the array designs for this EE
-        ArrayDesign arrayDesign = null;
-        Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( expExp );
-
-        // find the AD they have been merged into, make sure it is exists and they are all merged into the same AD.
-        for ( ArrayDesign design : oldArrayDesigns ) {
-            ArrayDesign mergedInto = design.getMergedInto();
-            // TODO: use thaw for collection of platforms
-            mergedInto = arrayDesignService.thaw( mergedInto );
-
-            if ( mergedInto == null ) {
-                throw new IllegalArgumentException(
-                        design + " used by " + expExp + " is not merged into another design" );
-            }
-
-            // TODO: go up the merge tree to find the root. This is too slow.
-            // while ( mergedInto.getMergedInto() != null ) {
-            // mergedInto = arrayDesignService.thaw( mergedInto.getMergedInto() );
-            // }
-
-            if ( arrayDesign == null ) {
-                arrayDesign = mergedInto;
-                arrayDesign = arrayDesignService.thaw( arrayDesign );
-            }
-
-            if ( !mergedInto.equals( arrayDesign ) ) {
-                throw new IllegalArgumentException(
-                        design + " used by " + expExp + " is not merged into " + arrayDesign );
-            }
-
-        }
-        return arrayDesign;
-    }
-
-    /**
-     * Set up a map of sequences to elements for the platform we're switching to
-     *
-     */
-    private void populateCSeq( ArrayDesign arrayDesign,
-            Map<BioSequence, Collection<CompositeSequence>> designElementMap,
-            Collection<CompositeSequence> elsWithNoSeq ) {
-        for ( CompositeSequence cs : arrayDesign.getCompositeSequences() ) {
-            BioSequence bs = cs.getBiologicalCharacteristic();
-            if ( bs == null ) {
-                elsWithNoSeq.add( cs );
-            } else {
-                if ( !designElementMap.containsKey( bs ) ) {
-                    designElementMap.put( bs, new HashSet<CompositeSequence>() );
-                }
-                designElementMap.get( bs ).add( cs );
-            }
-        }
-    }
-
-    /**
-     * @param  designElementMap      Mapping of sequences to probes for the platform that is being switch from. This is
-     *                               used
-     *                               to identify new candidates.
-     * @param  usedDesignElements    probes from the new design that have already been assigned to probes from the old
-     *                               design. If things are done correctly (the old design was merged into the new) then
-     *                               there should be enough.
-     *                               Map is of the new design probe to the old design probe it was used for (this is
-     *                               debugging information)
-     * @param  vector                vector
-     * @param  bad                   BioAssayDimension to use, if necessary. If this is null or already the one used,
-     *                               it's ignored.
-     *                               Otherwise the vector data will be rewritten to match it.
-     * @return true if a switch was made
-     * @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence
-     */
-    private boolean processVector( Map<BioSequence, Collection<CompositeSequence>> designElementMap,
-            Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, RawExpressionDataVector vector,
-            BioAssayDimension bad ) {
-
-        CompositeSequence oldDe = vector.getDesignElement();
-
-        Collection<CompositeSequence> newElCandidates;
-        BioSequence seq = oldDe.getBiologicalCharacteristic();
-        if ( seq == null ) {
-            newElCandidates = designElementMap.get( ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE );
-        } else {
-            newElCandidates = designElementMap.get( seq );
-        }
-
-        if ( newElCandidates == null || newElCandidates.isEmpty() ) {
-            return false;
-            //            throw new IllegalStateException(
-            //                    "There are no candidates probes for sequence: " + seq + "('null' should be okay)" );
-        }
-
-        for ( CompositeSequence newEl : newElCandidates ) {
-            if ( !usedDesignElements.containsKey( newEl ) ) {
-
-                vector.setDesignElement( newEl );
-                usedDesignElements.put( newEl, new HashSet<BioAssayDimension>() );
-                usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() );
-                break;
-            }
-
-            if ( !usedDesignElements.get( newEl ).contains( vector.getBioAssayDimension() ) ) {
-                /*
-                 * Then it's okay to use it.
-                 */
-                vector.setDesignElement( newEl );
-                usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() );
-                break;
-            }
-        }
-
-        if ( bad != null && !vector.getBioAssayDimension().equals( bad ) ) {
-            /*
-             * 1. Check if they are already the same; then just switch it to the desired BAD
-             * 2. If not, then the vector data has to be rewritten.
-             */
-            this.vectorReWrite( vector, bad );
-
-        }
-        return true;
-    }
-
-    /**
-     * Switch the vectors from one platform to the other, using the bioassaydimension passed if non-null
-     *
-     * @param  targetBioAssayDimension - if null we keep the current one; otherwise we rewrite data datavectors to use
-     *                                 the new one.
-     * @return how many vectors were switched
-     */
-    private int switchDataForPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign,
-            Map<BioSequence, Collection<CompositeSequence>> designElementMap, BioAssayDimension targetBioAssayDimension,
-            Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, ArrayDesign oldAd ) {
-        if ( oldAd.equals( arrayDesign ) )
-            return 0;
-
-        oldAd = arrayDesignService.thaw( oldAd );
-
-        if ( oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals( TechnologyType.SEQUENCING ) ) {
-            /*
-             * Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload.
-             */
-            throw new IllegalStateException( oldAd + " has no elements" );
-        }
-
-        int totalSwitched = 0;
-        Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes( ee, oldAd );
-        ExpressionExperimentPlatformSwitchService.log
-                .info( "Processing " + qts.size() + " quantitation types for vectors on " + oldAd );
-        for ( QuantitationType type : qts ) {
-
-            // use each design element only once per quantitation type + bioassaydimension per array design
-            usedDesignElements.clear();
-
-            // assumption: we have no processed data. They should have been deleted at this point.
-            Collection<RawExpressionDataVector> vecsForQt = new HashSet<>();
-
-            //
-
-            int count = 0;
-            for ( RawExpressionDataVector vector : ee.getRawExpressionDataVectors() ) {
-
-                if ( !vector.getQuantitationType().equals( type ) ) {
-                    continue;
-                }
-
-                CompositeSequence oldDe = vector.getDesignElement();
-
-                if ( !oldDe.getArrayDesign().equals( oldAd ) ) {
-                    continue;
-                }
-                vecsForQt.add( vector );
-
-            }
-
-            if ( vecsForQt.isEmpty() ) {
-                /*
-                 * This can happen when the quantitation types vary for the array designs.
-                 */
-                log.info( "No vectors for " + type + " on " + oldAd + " (Might be okay; not all QTs might be on all platforms)" );
-                continue;
-            }
-
-            log.info( "Switching " + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName()
-                    + " to " + arrayDesign.getShortName()
-                    + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) );
-
-            vecsForQt = rawExpressionDataVectorService.thaw( vecsForQt );
-
-            int numwarns = 0;
-            int maxwarns = 30;
-            for ( RawExpressionDataVector vector : vecsForQt ) {
-                if ( this.processVector( designElementMap, usedDesignElements, vector, targetBioAssayDimension ) ) {
-                    count++;
-                } else {
-                    if ( numwarns++ < maxwarns ) {
-                        log.warn( "No matching element found on target platform for " + vector.getDesignElement() );
-                    }
-                    if ( numwarns == maxwarns ) {
-                        log.warn( "[Further no-match warnings suppressed]" );
-                    }
-                }
-            }
-
-            if ( count != vecsForQt.size() ) {
-                throw new IllegalStateException(
-                        "Found matches for only " + count + "/" + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName()
-                                + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) );
-            }
-
-            // sanity check. this is all fine.
-            //            for ( RawExpressionDataVector v : vecsForQt ) {
-            //                if ( v.getQuantitationType().equals( type )
-            //                        && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) ) {
-            //                    throw new IllegalStateException( "A raw vector for QT =" + v.getQuantitationType()
-            //                            + " was not correctly switched to the target platform " + arrayDesign + ", it was on "
-            //                            + v.getDesignElement().getArrayDesign() + " while switching from " + oldAd );
-            //                }
-            //            }
-
-            //this check shouldn't be necessary
-            for ( RawExpressionDataVector v : ee.getRawExpressionDataVectors() ) {
-                if ( v.getQuantitationType().equals( type )
-                        && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) && v.getDesignElement().getArrayDesign().equals( oldAd ) ) {
-                    throw new IllegalStateException( "A raw vector " + v + "for QT =" + v.getQuantitationType()
-                            + " was not correctly switched to the target platform " + arrayDesign + ", it was still on "
-                            + v.getDesignElement().getArrayDesign() );
-                }
-            }
-
-            totalSwitched += count;
-
-        }
-        return totalSwitched;
-    }
-
-    /**
-     * Rearrange/expand a vector as necessary to use the given BioAssayDimension. Only used for multiplatform case of
-     * samples run on multiple platforms.
-     *
-     * @param vector vector
-     * @param bad    to be used as the replacement.
-     */
-    private void vectorReWrite( DesignElementDataVector vector, BioAssayDimension bad ) {
-        List<BioAssay> desiredOrder = bad.getBioAssays();
-        List<BioAssay> currentOrder = vector.getBioAssayDimension().getBioAssays();
-        if ( this.equivalent( currentOrder, desiredOrder ) ) {
-            // Easy, we can just switch it.
-            vector.setBioAssayDimension( bad );
-            return;
-        }
-
-        /*
-         * We remake the data vector following the new ordering.
-         */
-        PrimitiveType representation = vector.getQuantitationType().getRepresentation();
-        Object missingVal;
-        if ( representation.equals( PrimitiveType.DOUBLE ) ) {
-            missingVal = Double.NaN;
-        } else if ( representation.equals( PrimitiveType.STRING ) ) {
-            missingVal = "";
-        } else if ( representation.equals( PrimitiveType.INT ) ) {
-            missingVal = 0;
-        } else if ( representation.equals( PrimitiveType.BOOLEAN ) ) {
-            missingVal = false;
-        } else {
-            throw new UnsupportedOperationException(
-                    "Missing values in data vectors of type " + representation + " not supported (when processing "
-                            + vector );
-        }
-
-        List<Object> oldData = new ArrayList<>();
-        super.convertFromBytes( oldData, vector.getQuantitationType().getRepresentation(), vector );
-
-        /*
-         * Now data has the old data, so we need to rearrange it to match, inserting missings as necessary.
-         */
-        Map<BioMaterial, Integer> bm2loc = new HashMap<>();
-        int i = 0;
-        List<Object> newData = new ArrayList<>();
-        // initialize
-        for ( BioAssay ba : desiredOrder ) {
-            bm2loc.put( ba.getSampleUsed(), i++ );
-            newData.add( missingVal );
-        }
 
-        // Put data into new locations
-        int j = 0;
-        for ( BioAssay ba : currentOrder ) {
-            Integer loc = bm2loc.get( ba.getSampleUsed() );
-            assert loc != null;
-            newData.set( loc, oldData.get( j++ ) );
-        }
+public interface ExpressionExperimentPlatformSwitchService {
 
-        byte[] newDataAr = converter.toBytes( newData.toArray() );
-        vector.setData( newDataAr );
-        vector.setBioAssayDimension( bad );
+    void switchExperimentToArrayDesign( ExpressionExperiment ee, ArrayDesign arrayDesign );
 
-    }
+    void switchExperimentToMergedPlatform( ExpressionExperiment expExp );
 }
diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java
new file mode 100644
index 0000000000..b8ace91736
--- /dev/null
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/ExpressionExperimentPlatformSwitchServiceImpl.java
@@ -0,0 +1,652 @@
+/*
+ * The Gemma project
+ *
+ * Copyright (c) 2007 University of British Columbia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package ubic.gemma.core.loader.expression;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import ubic.gemma.core.analysis.expression.AnalysisUtilService;
+import ubic.gemma.core.analysis.preprocess.VectorMergingService;
+import ubic.gemma.core.analysis.service.ExpressionExperimentVectorManipulatingService;
+import ubic.gemma.model.common.quantitationtype.PrimitiveType;
+import ubic.gemma.model.common.quantitationtype.QuantitationType;
+import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
+import ubic.gemma.model.expression.arrayDesign.TechnologyType;
+import ubic.gemma.model.expression.bioAssay.BioAssay;
+import ubic.gemma.model.expression.bioAssayData.BioAssayDimension;
+import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector;
+import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector;
+import ubic.gemma.model.expression.biomaterial.BioMaterial;
+import ubic.gemma.model.expression.designElement.CompositeSequence;
+import ubic.gemma.model.expression.experiment.ExpressionExperiment;
+import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet;
+import ubic.gemma.model.genome.biosequence.BioSequence;
+import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService;
+import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
+import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService;
+import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService;
+import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
+import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSubSetService;
+
+import java.util.*;
+
+/**
+ * Switch an expression experiment from one array design to another. This is valuable when the EE uses more than on AD,
+ * and a merged AD exists. The following steps are needed:
+ * <ul>
+ *
+ * <li>Delete old analyses of this experiment and processeddata vectors
+ * <li>For each array design, for each probe, identify the matching probe on the merged AD. Have to deal with situation
+ * <li>where more than one occurrence of each sequence is found.
+ * <li>all DEDVs must be switched to use the new AD's design elements
+ * <li>all bioassays must be switched to the new AD.
+ * <li>update the EE description
+ * <li>commit changes.
+ * <li>Computed processed data vectors
+ * </ul>
+ * This also handles the case of multisamples-per-platform - NOT the case of one-sample-per-platform but multiple
+ * platforms; for that you have to run VectorMerging. For more nutty situations this will probably create a mess.
+ *
+ * @author pavlidis
+ * @see    VectorMergingService
+ */
+@Component
+public class ExpressionExperimentPlatformSwitchServiceImpl extends ExpressionExperimentVectorManipulatingService implements ExpressionExperimentPlatformSwitchService {
+
+    private static final Log log = LogFactory.getLog( ExpressionExperimentPlatformSwitchServiceImpl.class.getName() );
+
+    /**
+     * Used to identify design elements that have no sequence associated with them.
+     */
+    public static BioSequence NULL_BIOSEQUENCE;
+
+    static {
+        NULL_BIOSEQUENCE = BioSequence.Factory.newInstance();
+        ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE.setName( "______NULL______" );
+        ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE.setId( -1L );
+    }
+
+    @Autowired
+    private ArrayDesignService arrayDesignService;
+
+    @Autowired
+    private BioAssayDimensionService bioAssayDimensionService;
+
+    @Autowired
+    private BioAssayService bioAssayService;
+
+    @Autowired
+    private ExpressionExperimentService expressionExperimentService;
+
+    @Autowired
+    private ExperimentPlatformSwitchHelperService helperService;
+
+    @Autowired
+    private SampleCoexpressionAnalysisService sampleCoexpressionAnalysisService;
+
+    @Autowired
+    private ExpressionExperimentSubSetService subsetService;
+
+    @Autowired
+    private AnalysisUtilService analysisUtilService;
+
+    /**
+     * If you know the array designs are already in a merged state, you should use switchExperimentToMergedPlatform
+     *
+     * @param  ee          ee
+     * @param  arrayDesign The array design to switch to. If some samples already use that array design, nothing will be
+     *                     changed for them.
+     * @return the switched experiment
+     */
+    @Override
+    public void switchExperimentToArrayDesign( ExpressionExperiment ee, ArrayDesign arrayDesign ) {
+        assert arrayDesign != null;
+
+        // remove stuff that will be in the way.
+        processedExpressionDataVectorService.removeProcessedDataVectors( ee );
+        sampleCoexpressionAnalysisService.removeForExperiment( ee );
+        for ( ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets( ee ) ) {
+            subsetService.remove( subset );
+        }
+        analysisUtilService.deleteOldAnalyses( ee );
+
+        // get relation between sequence and designelements.
+        Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>();
+        Collection<CompositeSequence> elsWithNoSeq = new HashSet<>();
+        this.populateCSeq( arrayDesign, designElementMap, elsWithNoSeq );
+
+        ee = expressionExperimentService.thaw( ee );
+
+        if ( elsWithNoSeq.size() > 0 ) {
+            ExpressionExperimentPlatformSwitchServiceImpl.log
+                    .info( elsWithNoSeq.size() + " elements on the new platform have no associated sequence." );
+            designElementMap.put( ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE, elsWithNoSeq );
+        }
+
+        boolean multiPlatformPerSample = this.switchPlatform( ee, arrayDesign );
+        /*
+         * For a multiplatform-per-sample case ("Case 2", platforms are disjoint): (note that some samples might just be
+         * on one platform...)
+         * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it).
+         * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors (switchDataForPlatform)
+         * 3. Delete the Bioassays that are using other BADs (cleanupUnused)
+         *
+         * For Case 1 (each sample run on one platform, all platforms are related/similar) Simply switch the vectors
+         * (switchDataForPlatform)
+         */
+
+        /*
+         * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that
+         * includes all BioMaterials.
+         */
+        Collection<BioAssayDimension> unusedBADs = new HashSet<>();
+        BioAssayDimension targetBioAssayDimension = null;
+        if ( multiPlatformPerSample ) {
+            targetBioAssayDimension = this.doMultiSample( ee, unusedBADs );
+        }
+        if ( multiPlatformPerSample && targetBioAssayDimension == null ) {
+            throw new RuntimeException( "Data set cannot be switched to merged platform: no suitable bioassaydimension found" );
+        }
+
+        /*
+         * To account for cases where we have no data loaded yet.
+         */
+        boolean hasData = expressionExperimentService.getQuantitationTypes( ee ).size() > 0 && ee.getRawExpressionDataVectors().size() > 0;
+
+        Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( ee );
+        Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>();
+        int totalVectorsSwitched = 0;
+        for ( ArrayDesign oldAd : oldArrayDesigns ) {
+            totalVectorsSwitched += this.switchDataForPlatform( ee, arrayDesign, designElementMap, targetBioAssayDimension, usedDesignElements,
+                    oldAd );
+        }
+
+        if ( totalVectorsSwitched == 0 && hasData ) {
+            throw new RuntimeException( "No vectors were switched" );
+        }
+
+        String descriptionUpdate = "[Switched to use " + arrayDesign.getShortName() + " by Gemma]";
+        if ( !ee.getDescription().contains( descriptionUpdate ) ) {
+            ee.setDescription( ee.getDescription() + " " + descriptionUpdate );
+        }
+
+        helperService.persist( ee, arrayDesign );
+
+        log.info( "Finishing up" );
+        if ( targetBioAssayDimension != null && !unusedBADs.isEmpty() ) {
+            this.cleanupUnused( unusedBADs, targetBioAssayDimension );
+        }
+
+        ee = expressionExperimentService.loadOrFail( ee.getId() ); // refresh after cleanup
+        ee = expressionExperimentService.thawLite( ee );
+
+        if ( hasData ) {
+            processedExpressionDataVectorService.createProcessedDataVectors( ee ); // this still fails sometimes? works fine if run later by cli
+        }
+    }
+
+    /**
+     * Automatically identify an appropriate merged platform
+     *
+     * @param  expExp ee
+     * @return ee
+     */
+    @Override
+    public void switchExperimentToMergedPlatform( ExpressionExperiment expExp ) {
+        ArrayDesign arrayDesign = this.locateMergedDesign( expExp );
+        if ( arrayDesign == null )
+            throw new IllegalArgumentException( "Experiment has no merged design to switch to" );
+        this.switchExperimentToArrayDesign( expExp, arrayDesign );
+    }
+
+    private boolean switchPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign ) {
+        boolean multiPlatformPerSample = false;
+        for ( BioAssay assay : ee.getBioAssays() ) {
+
+            // Switch the assay to use the desired platform. However, if this is a second switch, don't lose the original value
+            if ( assay.getOriginalPlatform() == null ) {
+                assay.setOriginalPlatform( assay.getArrayDesignUsed() );
+            }
+            assay.setArrayDesignUsed( arrayDesign );
+
+            /*
+             * Side effect: Detect cases of each-sample-run-on-multiple-platforms - we need to merge the bioassays, too.
+             * That means fiddling with the BioAssayDimensions (BADs). We do this check here to avoid unnecessarily
+             * inspecting BADs.
+             */
+            if ( !multiPlatformPerSample && assay.getSampleUsed().getBioAssaysUsedIn().size() > 1 ) {
+                multiPlatformPerSample = true;
+            }
+        }
+        return multiPlatformPerSample;
+    }
+
+    /**
+     * Remove bioassays that are no longer needed
+     *
+     */
+    private void cleanupUnused( Collection<BioAssayDimension> unusedBADs, BioAssayDimension maxBAD ) {
+        ExpressionExperimentPlatformSwitchServiceImpl.log.info( "Checking for unused BioAssays after merge" );
+
+        Collection<BioAssay> removed = new HashSet<>();
+        for ( BioAssayDimension bioAssayDimension : unusedBADs ) {
+            List<BioAssay> bioAssays = bioAssayDimension.getBioAssays();
+            for ( BioAssay ba : bioAssays ) {
+                if ( !maxBAD.getBioAssays().contains( ba ) && !removed.contains( ba ) ) {
+                    try {
+                        ExpressionExperimentPlatformSwitchServiceImpl.log.info( "Deleting unused bioassay: " + ba );
+                        bioAssayService.remove( ba );
+                        removed.add( ba );
+                    } catch ( Exception e ) {
+                        log.error( "Failed to delete: " + ba, e );
+                    }
+                }
+            }
+            try {
+                bioAssayDimensionService.remove( bioAssayDimension );
+                log.info( "Removed unused bioAssayDimension ID=" + bioAssayDimension.getId() );
+            } catch ( Exception e ) {
+                log.warn( "Failed to clean up unused bioassaydimension with ID=" + bioAssayDimension.getId() + ": " + e
+                        .getMessage() );
+            }
+        }
+        log.info( "Removed " + removed.size() + " usused bioassays" );
+    }
+
+    /**
+     * Find the bioassaydimension that covers all the biomaterials.
+     *
+     */
+    private BioAssayDimension doMultiSample( ExpressionExperiment ee, Collection<BioAssayDimension> unusedBADs ) {
+
+        int maxSize = 0;
+        BioAssayDimension maxBAD = null;
+        for ( BioAssay ba : ee.getBioAssays() ) {
+            Collection<BioAssayDimension> oldBioAssayDims = bioAssayService.findBioAssayDimensions( ba );
+            for ( BioAssayDimension bioAssayDim : oldBioAssayDims ) {
+                unusedBADs.add( bioAssayDim );
+                int size = bioAssayDim.getBioAssays().size();
+
+                if ( size > maxSize ) {
+                    maxSize = size;
+                    maxBAD = bioAssayDim;
+                }
+            }
+        }
+
+        assert unusedBADs.size() > 1; // otherwise we shouldn't be here.
+        unusedBADs.remove( maxBAD );
+
+        /*
+         * Make sure all biomaterials in the study are included in the chosen bioassaydimension. If not, we'd have
+         * to make a new BAD.
+         */
+        if ( maxBAD != null ) {
+            Collection<BioMaterial> bmsInmaxBAD = new HashSet<>();
+            for ( BioAssay ba : maxBAD.getBioAssays() ) {
+                bmsInmaxBAD.add( ba.getSampleUsed() );
+            }
+
+            for ( BioAssay ba : ee.getBioAssays() ) {
+                if ( !bmsInmaxBAD.contains( ba.getSampleUsed() ) ) {
+
+                    ExpressionExperimentPlatformSwitchServiceImpl.log
+                            .debug( "This experiment looked like it had samples run on more than one platform, "
+                                    + "but it also has no BioAssayDimension that is eligible to accomodate all samples (Example: "
+                                    + ba.getSampleUsed() );
+                    maxBAD = null;
+                    break;
+                }
+            }
+
+        }
+
+        if ( maxBAD == null ) {
+            log.info( "Creating new bioassaydimension to accomodate merged data as no existing one was suitable" );
+            maxBAD = BioAssayDimension.Factory.newInstance( "For " + ee.getBioAssays().size() +
+                    " bioMaterials", "Created to accomodate platform switch", new ArrayList<>( ee.getBioAssays() ) );
+            maxBAD = bioAssayDimensionService.create( maxBAD );
+        }
+
+
+        return maxBAD;
+    }
+
+    /**
+     * Determine whether the two bioassaydimensions are the same, based on the samples used. Note it is inefficient to
+     * call this over and over but it's not a big deal so far.
+     *
+     * @param  currentOrder current order
+     * @param  desiredOrder desired order
+     * @return boolean
+     */
+    private boolean equivalent( List<BioAssay> currentOrder, List<BioAssay> desiredOrder ) {
+        if ( currentOrder.size() != desiredOrder.size() ) {
+            return false;
+        }
+
+        for ( int i = 0; i < currentOrder.size(); i++ ) {
+            if ( !currentOrder.get( i ).getSampleUsed().equals( desiredOrder.get( i ).getSampleUsed() ) ) {
+                return false;
+            }
+        }
+
+        return true;
+
+    }
+
+    private ArrayDesign locateMergedDesign( ExpressionExperiment expExp ) {
+        // get the array designs for this EE
+        ArrayDesign arrayDesign = null;
+        Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed( expExp );
+
+        // find the AD they have been merged into, make sure it is exists and they are all merged into the same AD.
+        for ( ArrayDesign design : oldArrayDesigns ) {
+            ArrayDesign mergedInto = design.getMergedInto();
+            // TODO: use thaw for collection of platforms
+            mergedInto = arrayDesignService.thaw( mergedInto );
+
+            if ( mergedInto == null ) {
+                throw new IllegalArgumentException(
+                        design + " used by " + expExp + " is not merged into another design" );
+            }
+
+            // TODO: go up the merge tree to find the root. This is too slow.
+            // while ( mergedInto.getMergedInto() != null ) {
+            // mergedInto = arrayDesignService.thaw( mergedInto.getMergedInto() );
+            // }
+
+            if ( arrayDesign == null ) {
+                arrayDesign = mergedInto;
+                arrayDesign = arrayDesignService.thaw( arrayDesign );
+            }
+
+            if ( !mergedInto.equals( arrayDesign ) ) {
+                throw new IllegalArgumentException(
+                        design + " used by " + expExp + " is not merged into " + arrayDesign );
+            }
+
+        }
+        return arrayDesign;
+    }
+
+    /**
+     * Set up a map of sequences to elements for the platform we're switching to
+     *
+     */
+    private void populateCSeq( ArrayDesign arrayDesign,
+            Map<BioSequence, Collection<CompositeSequence>> designElementMap,
+            Collection<CompositeSequence> elsWithNoSeq ) {
+        for ( CompositeSequence cs : arrayDesign.getCompositeSequences() ) {
+            BioSequence bs = cs.getBiologicalCharacteristic();
+            if ( bs == null ) {
+                elsWithNoSeq.add( cs );
+            } else {
+                if ( !designElementMap.containsKey( bs ) ) {
+                    designElementMap.put( bs, new HashSet<CompositeSequence>() );
+                }
+                designElementMap.get( bs ).add( cs );
+            }
+        }
+    }
+
+    /**
+     * @param  designElementMap      Mapping of sequences to probes for the platform that is being switch from. This is
+     *                               used
+     *                               to identify new candidates.
+     * @param  usedDesignElements    probes from the new design that have already been assigned to probes from the old
+     *                               design. If things are done correctly (the old design was merged into the new) then
+     *                               there should be enough.
+     *                               Map is of the new design probe to the old design probe it was used for (this is
+     *                               debugging information)
+     * @param  vector                vector
+     * @param  bad                   BioAssayDimension to use, if necessary. If this is null or already the one used,
+     *                               it's ignored.
+     *                               Otherwise the vector data will be rewritten to match it.
+     * @return true if a switch was made
+     * @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence
+     */
+    private boolean processVector( Map<BioSequence, Collection<CompositeSequence>> designElementMap,
+            Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, RawExpressionDataVector vector,
+            BioAssayDimension bad ) {
+
+        CompositeSequence oldDe = vector.getDesignElement();
+
+        Collection<CompositeSequence> newElCandidates;
+        BioSequence seq = oldDe.getBiologicalCharacteristic();
+        if ( seq == null ) {
+            newElCandidates = designElementMap.get( ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE );
+        } else {
+            newElCandidates = designElementMap.get( seq );
+        }
+
+        if ( newElCandidates == null || newElCandidates.isEmpty() ) {
+            return false;
+            //            throw new IllegalStateException(
+            //                    "There are no candidates probes for sequence: " + seq + "('null' should be okay)" );
+        }
+
+        for ( CompositeSequence newEl : newElCandidates ) {
+            if ( !usedDesignElements.containsKey( newEl ) ) {
+
+                vector.setDesignElement( newEl );
+                usedDesignElements.put( newEl, new HashSet<BioAssayDimension>() );
+                usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() );
+                break;
+            }
+
+            if ( !usedDesignElements.get( newEl ).contains( vector.getBioAssayDimension() ) ) {
+                /*
+                 * Then it's okay to use it.
+                 */
+                vector.setDesignElement( newEl );
+                usedDesignElements.get( newEl ).add( vector.getBioAssayDimension() );
+                break;
+            }
+        }
+
+        if ( bad != null && !vector.getBioAssayDimension().equals( bad ) ) {
+            /*
+             * 1. Check if they are already the same; then just switch it to the desired BAD
+             * 2. If not, then the vector data has to be rewritten.
+             */
+            this.vectorReWrite( vector, bad );
+
+        }
+        return true;
+    }
+
+    /**
+     * Switch the vectors from one platform to the other, using the bioassaydimension passed if non-null
+     *
+     * @param  targetBioAssayDimension - if null we keep the current one; otherwise we rewrite data datavectors to use
+     *                                 the new one.
+     * @return how many vectors were switched
+     */
+    private int switchDataForPlatform( ExpressionExperiment ee, ArrayDesign arrayDesign,
+            Map<BioSequence, Collection<CompositeSequence>> designElementMap, BioAssayDimension targetBioAssayDimension,
+            Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, ArrayDesign oldAd ) {
+        if ( oldAd.equals( arrayDesign ) )
+            return 0;
+
+        oldAd = arrayDesignService.thaw( oldAd );
+
+        if ( oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals( TechnologyType.SEQUENCING ) ) {
+            /*
+             * Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload.
+             */
+            throw new IllegalStateException( oldAd + " has no elements" );
+        }
+
+        int totalSwitched = 0;
+        Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes( ee, oldAd );
+        ExpressionExperimentPlatformSwitchServiceImpl.log
+                .info( "Processing " + qts.size() + " quantitation types for vectors on " + oldAd );
+        for ( QuantitationType type : qts ) {
+
+            // use each design element only once per quantitation type + bioassaydimension per array design
+            usedDesignElements.clear();
+
+            // assumption: we have no processed data. They should have been deleted at this point.
+            Collection<RawExpressionDataVector> vecsForQt = new HashSet<>();
+
+            //
+
+            int count = 0;
+            for ( RawExpressionDataVector vector : ee.getRawExpressionDataVectors() ) {
+
+                if ( !vector.getQuantitationType().equals( type ) ) {
+                    continue;
+                }
+
+                CompositeSequence oldDe = vector.getDesignElement();
+
+                if ( !oldDe.getArrayDesign().equals( oldAd ) ) {
+                    continue;
+                }
+                vecsForQt.add( vector );
+
+            }
+
+            if ( vecsForQt.isEmpty() ) {
+                /*
+                 * This can happen when the quantitation types vary for the array designs.
+                 */
+                log.info( "No vectors for " + type + " on " + oldAd + " (Might be okay; not all QTs might be on all platforms)" );
+                continue;
+            }
+
+            log.info( "Switching " + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName()
+                    + " to " + arrayDesign.getShortName()
+                    + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) );
+
+            vecsForQt = rawExpressionDataVectorService.thaw( vecsForQt );
+
+            int numwarns = 0;
+            int maxwarns = 30;
+            for ( RawExpressionDataVector vector : vecsForQt ) {
+                if ( this.processVector( designElementMap, usedDesignElements, vector, targetBioAssayDimension ) ) {
+                    count++;
+                } else {
+                    if ( numwarns++ < maxwarns ) {
+                        log.warn( "No matching element found on target platform for " + vector.getDesignElement() );
+                    }
+                    if ( numwarns == maxwarns ) {
+                        log.warn( "[Further no-match warnings suppressed]" );
+                    }
+                }
+            }
+
+            if ( count != vecsForQt.size() ) {
+                throw new IllegalStateException(
+                        "Found matches for only " + count + "/" + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName()
+                                + ( targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension ) );
+            }
+
+            // sanity check. this is all fine.
+            //            for ( RawExpressionDataVector v : vecsForQt ) {
+            //                if ( v.getQuantitationType().equals( type )
+            //                        && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) ) {
+            //                    throw new IllegalStateException( "A raw vector for QT =" + v.getQuantitationType()
+            //                            + " was not correctly switched to the target platform " + arrayDesign + ", it was on "
+            //                            + v.getDesignElement().getArrayDesign() + " while switching from " + oldAd );
+            //                }
+            //            }
+
+            //this check shouldn't be necessary
+            for ( RawExpressionDataVector v : ee.getRawExpressionDataVectors() ) {
+                if ( v.getQuantitationType().equals( type )
+                        && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) && v.getDesignElement().getArrayDesign().equals( oldAd ) ) {
+                    throw new IllegalStateException( "A raw vector " + v + "for QT =" + v.getQuantitationType()
+                            + " was not correctly switched to the target platform " + arrayDesign + ", it was still on "
+                            + v.getDesignElement().getArrayDesign() );
+                }
+            }
+
+            totalSwitched += count;
+
+        }
+        return totalSwitched;
+    }
+
+    /**
+     * Rearrange/expand a vector as necessary to use the given BioAssayDimension. Only used for multiplatform case of
+     * samples run on multiple platforms.
+     *
+     * @param vector vector
+     * @param bad    to be used as the replacement.
+     */
+    private void vectorReWrite( DesignElementDataVector vector, BioAssayDimension bad ) {
+        List<BioAssay> desiredOrder = bad.getBioAssays();
+        List<BioAssay> currentOrder = vector.getBioAssayDimension().getBioAssays();
+        if ( this.equivalent( currentOrder, desiredOrder ) ) {
+            // Easy, we can just switch it.
+            vector.setBioAssayDimension( bad );
+            return;
+        }
+
+        /*
+         * We remake the data vector following the new ordering.
+         */
+        PrimitiveType representation = vector.getQuantitationType().getRepresentation();
+        Object missingVal;
+        if ( representation.equals( PrimitiveType.DOUBLE ) ) {
+            missingVal = Double.NaN;
+        } else if ( representation.equals( PrimitiveType.STRING ) ) {
+            missingVal = "";
+        } else if ( representation.equals( PrimitiveType.INT ) ) {
+            missingVal = 0;
+        } else if ( representation.equals( PrimitiveType.BOOLEAN ) ) {
+            missingVal = false;
+        } else {
+            throw new UnsupportedOperationException(
+                    "Missing values in data vectors of type " + representation + " not supported (when processing "
+                            + vector );
+        }
+
+        List<Object> oldData = new ArrayList<>();
+        super.convertFromBytes( oldData, vector.getQuantitationType().getRepresentation(), vector );
+
+        /*
+         * Now data has the old data, so we need to rearrange it to match, inserting missings as necessary.
+         */
+        Map<BioMaterial, Integer> bm2loc = new HashMap<>();
+        int i = 0;
+        List<Object> newData = new ArrayList<>();
+        // initialize
+        for ( BioAssay ba : desiredOrder ) {
+            bm2loc.put( ba.getSampleUsed(), i++ );
+            newData.add( missingVal );
+        }
+
+        // Put data into new locations
+        int j = 0;
+        for ( BioAssay ba : currentOrder ) {
+            Integer loc = bm2loc.get( ba.getSampleUsed() );
+            assert loc != null;
+            newData.set( loc, oldData.get( j++ ) );
+        }
+
+        byte[] newDataAr = converter.toBytes( newData.toArray() );
+        vector.setData( newDataAr );
+        vector.setBioAssayDimension( bad );
+
+    }
+}
diff --git a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java
index d8cefd2d54..34f04f7555 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/loader/expression/arrayDesign/ArrayDesignMergeServiceImpl.java
@@ -25,6 +25,7 @@
 import org.springframework.stereotype.Component;
 import ubic.gemma.core.analysis.report.ArrayDesignReportService;
 import ubic.gemma.core.loader.expression.ExpressionExperimentPlatformSwitchService;
+import ubic.gemma.core.loader.expression.ExpressionExperimentPlatformSwitchServiceImpl;
 import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
 import ubic.gemma.model.expression.designElement.CompositeSequence;
 import ubic.gemma.model.genome.biosequence.BioSequence;
@@ -210,7 +211,7 @@ private ArrayDesign makeBioSeqMap( Map<BioSequence, Collection<CompositeSequence
             if ( bs == null ) {
                 // this is common. We could try to match them up based on the probe name, because there is nothing else
                 // to go on.
-                bs = ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE;
+                bs = ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE;
             }
 
             if ( !globalBsMap.containsKey( bs ) ) {
@@ -265,7 +266,7 @@ private Collection<CompositeSequence> makeNewProbes( ArrayDesign arrayDesign,
 
                 CompositeSequence newCs = CompositeSequence.Factory.newInstance();
 
-                if ( !bs.equals( ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE ) ) {
+                if ( !bs.equals( ExpressionExperimentPlatformSwitchServiceImpl.NULL_BIOSEQUENCE ) ) {
                     newCs.setBiologicalCharacteristic( bs );
                 }
 

From 8c6a5283d8e2864c359ae5f4dedc275510627452 Mon Sep 17 00:00:00 2001
From: Guillaume Poirier-Morency <poirigui@msl.ubc.ca>
Date: Mon, 24 Jul 2023 21:27:21 -0700
Subject: [PATCH 8/8] Add a unit test for skipping log2cpm and post-processing
 in DataUpdater

---
 .../loader/expression/DataUpdater2Test.java   | 173 ++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java

diff --git a/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java
new file mode 100644
index 0000000000..14817733b7
--- /dev/null
+++ b/gemma-core/src/test/java/ubic/gemma/core/loader/expression/DataUpdater2Test.java
@@ -0,0 +1,173 @@
+package ubic.gemma.core.loader.expression;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.AbstractJUnit4SpringContextTests;
+import ubic.basecode.dataStructure.matrix.DoubleMatrix;
+import ubic.basecode.io.reader.DoubleMatrixReader;
+import ubic.gemma.core.analysis.preprocess.PreprocessorService;
+import ubic.gemma.core.analysis.preprocess.VectorMergingService;
+import ubic.gemma.core.loader.expression.geo.service.GeoService;
+import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
+import ubic.gemma.model.expression.bioAssay.BioAssay;
+import ubic.gemma.model.expression.biomaterial.BioMaterial;
+import ubic.gemma.model.expression.designElement.CompositeSequence;
+import ubic.gemma.model.expression.experiment.ExpressionExperiment;
+import ubic.gemma.persistence.service.analysis.expression.pca.PrincipalComponentAnalysisService;
+import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService;
+import ubic.gemma.persistence.service.common.auditAndSecurity.AuditTrailService;
+import ubic.gemma.persistence.service.common.quantitationtype.QuantitationTypeService;
+import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
+import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService;
+import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService;
+import ubic.gemma.persistence.service.expression.bioAssayData.RawAndProcessedExpressionDataVectorService;
+import ubic.gemma.persistence.service.expression.bioAssayData.RawExpressionDataVectorService;
+import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
+import ubic.gemma.persistence.util.TestComponent;
+
+import java.io.IOException;
+import java.util.Collections;
+
+import static org.mockito.Mockito.*;
+
+@ContextConfiguration
+public class DataUpdater2Test extends AbstractJUnit4SpringContextTests {
+
+    @Configuration
+    @TestComponent
+    static class DataUpdater2TestContextConfiguration {
+
+        @Bean
+        public DataUpdater dataUpdater() {
+            return new DataUpdaterImpl();
+        }
+
+        @Bean
+        public ArrayDesignService arrayDesignService() {
+            return mock( ArrayDesignService.class );
+        }
+
+        @Bean
+        public AuditTrailService auditTrailService() {
+            return mock( AuditTrailService.class );
+        }
+
+        @Bean
+        public BioAssayDimensionService bioAssayDimensionService() {
+            return mock( BioAssayDimensionService.class );
+        }
+
+        @Bean
+        public BioAssayService bioAssayService() {
+            return mock( BioAssayService.class );
+        }
+
+        @Bean
+        public ExpressionExperimentPlatformSwitchService experimentPlatformSwitchService() {
+            return mock( ExpressionExperimentPlatformSwitchService.class );
+        }
+
+        @Bean
+        public ExpressionExperimentService experimentService() {
+            return mock( ExpressionExperimentService.class );
+        }
+
+        @Bean
+        public GeoService geoService() {
+            return mock( GeoService.class );
+        }
+
+        @Bean
+        public PrincipalComponentAnalysisService pcaService() {
+            return mock( PrincipalComponentAnalysisService.class );
+        }
+
+        @Bean
+        public PreprocessorService preprocessorService() {
+            return mock( PreprocessorService.class );
+        }
+
+        @Bean
+        public QuantitationTypeService qtService() {
+            return mock( QuantitationTypeService.class );
+        }
+
+        @Bean
+        public RawExpressionDataVectorService rawExpressionDataVectorService() {
+            return mock( RawExpressionDataVectorService.class );
+        }
+
+        @Bean
+        public SampleCoexpressionAnalysisService sampleCorService() {
+            return mock( SampleCoexpressionAnalysisService.class );
+        }
+
+        @Bean
+        public VectorMergingService vectorMergingService() {
+            return mock( VectorMergingService.class );
+        }
+
+        @Bean
+        public RawAndProcessedExpressionDataVectorService rawAndProcessedExpressionDataVectorService() {
+            return mock( RawAndProcessedExpressionDataVectorService.class );
+        }
+    }
+
+    @Autowired
+    private DataUpdater dataUpdater;
+
+    @Autowired
+    private PreprocessorService preprocessorService;
+
+    @Autowired
+    private ExpressionExperimentService expressionExperimentService;
+
+    @Autowired
+    private ArrayDesignService arrayDesignService;
+
+    @Autowired
+    private BioAssayDimensionService bioAssayDimensionService;
+
+    private ExpressionExperiment ee;
+    private ArrayDesign ad;
+    private DoubleMatrix<String, String> rpkmMatrix, countMatrix;
+
+    @Before
+    public void setUp() throws IOException {
+        rpkmMatrix = new DoubleMatrixReader().read( getClass().getResourceAsStream( "/data/loader/expression/flatfileload/GSE19166_expression_count.test.txt" ) );
+        countMatrix = new DoubleMatrixReader().read( getClass().getResourceAsStream( "/data/loader/expression/flatfileload/GSE19166_expression_RPKM.test.txt" ) );
+        ee = new ExpressionExperiment();
+        for ( String col : countMatrix.getColNames() ) {
+            BioAssay ba = BioAssay.Factory.newInstance( col );
+            BioMaterial bm = BioMaterial.Factory.newInstance( col );
+            bm.setBioAssaysUsedIn( Collections.singleton( ba ) );
+            ba.setSampleUsed( bm );
+            ee.getBioAssays().add( ba );
+        }
+        ad = new ArrayDesign();
+        ad.setId( 1L );
+        for ( String row : countMatrix.getRowNames() ) {
+            ad.getCompositeSequences().add( CompositeSequence.Factory.newInstance( row, ad ) );
+        }
+    }
+
+    @Test
+    public void testSkipRecomputeLog2cpm() {
+        when( expressionExperimentService.thawLite( ee ) ).thenReturn( ee );
+        when( expressionExperimentService.addRawVectors( same( ee ), any() ) ).thenReturn( ee );
+        when( expressionExperimentService.replaceRawVectors( same( ee ), any() ) ).thenReturn( ee );
+        when( arrayDesignService.thaw( ad ) ).thenReturn( ad );
+        when( expressionExperimentService.getArrayDesignsUsed( ee ) ).thenReturn( Collections.singleton( ad ) );
+        when( bioAssayDimensionService.findOrCreate( any() ) ).thenAnswer( a -> a.getArgument( 0 ) );
+        dataUpdater.addCountData( ee, ad, countMatrix, rpkmMatrix, 31, false, false, false );
+        verify( preprocessorService ).process( ee );
+        verify( expressionExperimentService, times( 2 ) ).addRawVectors( same( ee ), any() );
+        dataUpdater.addCountData( ee, ad, countMatrix, rpkmMatrix, 30, false, false, true );
+        verify( expressionExperimentService ).replaceRawVectors( same( ee ), any() );
+        verifyNoMoreInteractions( preprocessorService );
+    }
+}
\ No newline at end of file