Skip to content

Commit

Permalink
[SYSTEMDS-2787] Compression Steps Reorganization
Browse files Browse the repository at this point in the history
This commit contains various changes (some massive ones).
The biggest change is the ordering of the compression steps: we now
classify first on a sample of the data, since experiments showed this
to be 10-30% faster. Furthermore, this allows us to try compression at a
lower cost if the compression is not valid to perform.

Overall Compression time for covtype went from
 - ~1.0 to 0.36 sec (including read from disk) 0.11 sec compression

Furthermore, unlike before, the transpose is now chosen heuristically, since
it is more efficient not to transpose the matrix for compression in some
cases.

- Compressed Sparse matrix multiplication fix
- modified matrix multiplication to push down information of
  transposing to the ba+* op. to allow not decompressing the matrix.
- Configuration option of enabling and disabling overlapping compression.
- decompress row section direct access to the matrix block not using
  quick set/get.
- adding safe boolean to decompress to specify if management of
  nnz should be done. This allows the decompression of intermediates at
  near half the computation cost.
- Add configuration for sampling ratio default 0.01 but with a minimum
  sample size of 2000 elements.
- DML Config settings for Cocode-Compression method default to COST
- add support for right sparse matrix multiplication with overlapping
  output. Further improvements are on the way.
- Compression statistics are added when both statistics and compression
  are enabled.
- Readers for extracting bitmaps are optimized for either transposed or
  untransposed matrices giving 5-15% improved performance.
- Hashmaps are modified to improve insertion time, since previously they
  would hash values twice (10% improved performance). Furthermore, the
  default sizes are modified to start smaller.
- Additional tests for multiplication to cover different edge cases.
  • Loading branch information
Baunsgaard committed Jan 8, 2021
1 parent d4839b4 commit 12cdc89
Show file tree
Hide file tree
Showing 65 changed files with 2,099 additions and 959 deletions.
11 changes: 10 additions & 1 deletion src/main/java/org/apache/sysds/conf/DMLConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ public class DMLConfig
public static final String COMPRESSED_LINALG = "sysds.compressed.linalg"; //auto, cost, true, false
public static final String COMPRESSED_LOSSY = "sysds.compressed.lossy";
public static final String COMPRESSED_VALID_COMPRESSIONS = "sysds.compressed.valid.compressions";
public static final String COMPRESSED_OVERLAPPING = "sysds.compressed.overlapping"; // true, false
public static final String COMPRESSED_SAMPLING_RATIO = "sysds.compressed.sampling.ratio"; // ratio, e.g. 0.01 (the registered default)
public static final String COMPRESSED_COCODE = "sysds.compressed.cocode"; // e.g. COST (the registered default)
public static final String COMPRESSED_TRANSPOSE = "sysds.compressed.transpose"; // true, false, auto.
public static final String NATIVE_BLAS = "sysds.native.blas";
public static final String NATIVE_BLAS_DIR = "sysds.native.blas.directory";
public static final String CODEGEN = "sysds.codegen.enabled"; //boolean
Expand Down Expand Up @@ -121,6 +125,10 @@ public class DMLConfig
_defaultVals.put(COMPRESSED_LINALG, Compression.CompressConfig.AUTO.name() );
_defaultVals.put(COMPRESSED_LOSSY, "false" );
_defaultVals.put(COMPRESSED_VALID_COMPRESSIONS, "DDC,OLE,RLE");
_defaultVals.put(COMPRESSED_OVERLAPPING, "false" );
_defaultVals.put(COMPRESSED_SAMPLING_RATIO, "0.01");
_defaultVals.put(COMPRESSED_COCODE, "COST");
_defaultVals.put(COMPRESSED_TRANSPOSE, "auto");
_defaultVals.put(CODEGEN, "false" );
_defaultVals.put(CODEGEN_API, GeneratorAPI.JAVA.name() );
_defaultVals.put(CODEGEN_COMPILER, CompilerType.AUTO.name() );
Expand Down Expand Up @@ -385,7 +393,8 @@ public String getConfigInfo() {
String[] tmpConfig = new String[] {
LOCAL_TMP_DIR,SCRATCH_SPACE,OPTIMIZATION_LEVEL, DEFAULT_BLOCK_SIZE,
CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
COMPRESSED_LINALG, COMPRESSED_LOSSY, COMPRESSED_VALID_COMPRESSIONS,
COMPRESSED_LINALG, COMPRESSED_LOSSY, COMPRESSED_VALID_COMPRESSIONS, COMPRESSED_OVERLAPPING,
COMPRESSED_SAMPLING_RATIO, COMPRESSED_COCODE, COMPRESSED_TRANSPOSE,
CODEGEN, CODEGEN_API, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO,
AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY,
Expand Down
15 changes: 14 additions & 1 deletion src/main/java/org/apache/sysds/hops/AggBinaryOp.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.sysds.common.Types.OpOp2;
import org.apache.sysds.common.Types.ReOrgOp;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.conf.ConfigurationManager;
import org.apache.sysds.hops.rewrite.HopRewriteUtils;
import org.apache.sysds.lops.Lop;
import org.apache.sysds.lops.LopProperties.ExecType;
Expand Down Expand Up @@ -598,7 +599,7 @@ private void constructCPLopsPMM()
private void constructCPLopsMM(ExecType et)
{
Lop matmultCP = null;

String cla = ConfigurationManager.getDMLConfig().getTextValue("sysds.compressed.linalg");
if (et == ExecType.GPU) {
Hop h1 = getInput().get(0);
Hop h2 = getInput().get(1);
Expand All @@ -615,6 +616,18 @@ private void constructCPLopsMM(ExecType et)
matmultCP = new MatMultCP(left, right, getDataType(), getValueType(), et, leftTrans, rightTrans);
setOutputDimensions(matmultCP);
}
else if (cla.equals("true") || cla.equals("cost")){
Hop h1 = getInput().get(0);
Hop h2 = getInput().get(1);
int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
boolean leftTrans = HopRewriteUtils.isTransposeOperation(h1);
boolean rightTrans = HopRewriteUtils.isTransposeOperation(h2);
Lop left = !leftTrans ? h1.constructLops() :
h1.getInput().get(0).constructLops();
Lop right = !rightTrans ? h2.constructLops() :
h2.getInput().get(0).constructLops();
matmultCP = new MatMultCP(left, right, getDataType(), getValueType(), et, k, leftTrans, rightTrans);
}
else {
if( isLeftTransposeRewriteApplicable(true) ) {
matmultCP = constructCPLopsMMWithLeftTransposeRewrite();
Expand Down
47 changes: 29 additions & 18 deletions src/main/java/org/apache/sysds/lops/MatMultCP.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,58 +24,69 @@
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.common.Types.ValueType;

public class MatMultCP extends Lop
{
public class MatMultCP extends Lop {
private int numThreads = -1;
private boolean isLeftTransposed; // Used for GPU matmult operation
private boolean isRightTransposed;

private boolean useTranspose;

/**
 * Creates a matrix multiplication lop with the thread count fixed to 1.
 * Delegates to the constructor that takes an explicit thread count.
 *
 * @param input1 left input lop
 * @param input2 right input lop
 * @param dt     data type of the output
 * @param vt     value type of the output
 * @param et     execution type of this lop
 */
public MatMultCP(Lop input1, Lop input2, DataType dt, ValueType vt, ExecType et) {
this(input1, input2, dt, vt, et, 1);
}

/**
 * Creates a matrix multiplication lop of type {@code Lop.Type.MatMultCP}
 * with an explicit thread count. The transpose flags are left untouched.
 *
 * @param input1 left input lop
 * @param input2 right input lop
 * @param dt     data type of the output
 * @param vt     value type of the output
 * @param et     execution type of this lop
 * @param k      number of threads, stored in {@code numThreads}
 */
public MatMultCP(Lop input1, Lop input2, DataType dt, ValueType vt, ExecType et, int k) {
super(Lop.Type.MatMultCP, dt, vt);
init(input1, input2, dt, vt, et);
numThreads = k;
}
public MatMultCP(Lop input1, Lop input2, DataType dt, ValueType vt, ExecType et,
boolean isLeftTransposed, boolean isRightTransposed) {

/**
 * Creates a matrix multiplication lop that carries transpose information
 * for both inputs. {@code numThreads} keeps its field default of -1, and
 * {@code useTranspose} is enabled.
 *
 * NOTE(review): this constructor passes {@code Lop.Type.Binary} to the
 * super constructor, whereas the thread-count constructor passes
 * {@code Lop.Type.MatMultCP} -- confirm that the difference is intended.
 *
 * @param input1            left input lop
 * @param input2            right input lop
 * @param dt                data type of the output
 * @param vt                value type of the output
 * @param et                execution type of this lop
 * @param isLeftTransposed  value stored in the {@code isLeftTransposed} field
 * @param isRightTransposed value stored in the {@code isRightTransposed} field
 */
public MatMultCP(Lop input1, Lop input2, DataType dt, ValueType vt, ExecType et, boolean isLeftTransposed,
boolean isRightTransposed) {
super(Lop.Type.Binary, dt, vt);
init(input1, input2, dt, vt, et);
this.isLeftTransposed = isLeftTransposed;
this.isRightTransposed = isRightTransposed;
// Marks that the transpose flags were supplied for this lop.
this.useTranspose = true;
}


/**
 * Creates a matrix multiplication lop with both an explicit thread count
 * and transpose information for the inputs. Delegates to the thread-count
 * constructor, then records the transpose flags and enables
 * {@code useTranspose}.
 *
 * @param input1            left input lop
 * @param input2            right input lop
 * @param dt                data type of the output
 * @param vt                value type of the output
 * @param et                execution type of this lop
 * @param k                 number of threads
 * @param isLeftTransposed  value stored in the {@code isLeftTransposed} field
 * @param isRightTransposed value stored in the {@code isRightTransposed} field
 */
public MatMultCP(Lop input1, Lop input2, DataType dt, ValueType vt, ExecType et, int k, boolean isLeftTransposed,
boolean isRightTransposed) {
this(input1, input2, dt, vt, et, k);
this.isLeftTransposed = isLeftTransposed;
this.isRightTransposed = isRightTransposed;
this.useTranspose = true;
}

private void init(Lop input1, Lop input2, DataType dt, ValueType vt, ExecType et) {
addInput(input1);
addInput(input2);
input1.addOutput(this);
input2.addOutput(this);
lps.setProperties( inputs, et);
lps.setProperties(inputs, et);
}

/**
 * Returns a fixed description naming the {@code ba+*} operation.
 *
 * @return the constant string {@code " Operation: ba+*"}
 */
@Override
public String toString() {
return " Operation: ba+*";
}

@Override
public String getInstructions(String input1, String input2, String output) {
if( getExecType() == ExecType.CP ) {
return InstructionUtils.concatOperands(
getExecType().name(), "ba+*",
if(!useTranspose) {
return InstructionUtils.concatOperands(getExecType().name(),
"ba+*",
getInputs().get(0).prepInputOperand(input1),
getInputs().get(1).prepInputOperand(input2),
prepOutputOperand(output), String.valueOf(numThreads));
prepOutputOperand(output),
String.valueOf(numThreads));
}
else { //GPU
return InstructionUtils.concatOperands(
getExecType().name(), "ba+*",
else { // GPU or compressed
return InstructionUtils.concatOperands(getExecType().name(),
"ba+*",
getInputs().get(0).prepInputOperand(input1),
getInputs().get(1).prepInputOperand(input2),
prepOutputOperand(output), String.valueOf(numThreads),
prepOutputOperand(output),
String.valueOf(numThreads),
String.valueOf(isLeftTransposed),
String.valueOf(isRightTransposed));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
import org.apache.sysds.runtime.matrix.operators.COVOperator;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.QuaternaryOperator;
import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
import org.apache.sysds.runtime.matrix.operators.TernaryOperator;
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
import org.apache.sysds.runtime.util.IndexRange;
Expand Down Expand Up @@ -82,6 +81,7 @@ public AbstractCompressedMatrixBlock() {

/**
* Create a potentially overlapping Compressed Matrix Block.
*
* @param overLapping boolean specifying if the matrix blocks columns are overlapping.
*/
public AbstractCompressedMatrixBlock(boolean overLapping) {
Expand Down Expand Up @@ -175,13 +175,6 @@ public void incrementalAggregate(AggregateOperator aggOp, MatrixValue newWithCor
throw new DMLRuntimeException("CompressedMatrixBlock: incrementalAggregate not supported.");
}

/**
 * Fallback reorg implementation: calls {@code printDecompressWarning},
 * decompresses this block, and delegates the operation to the resulting
 * uncompressed {@code MatrixBlock}.
 *
 * @param op          reorg operator to apply
 * @param ret         result block passed through to the uncompressed implementation
 * @param startRow    passed through unchanged
 * @param startColumn passed through unchanged
 * @param length      passed through unchanged
 * @return the result of the uncompressed {@code reorgOperations} call
 */
@Override
public MatrixBlock reorgOperations(ReorgOperator op, MatrixValue ret, int startRow, int startColumn, int length) {
printDecompressWarning("reorgOperations");
MatrixBlock tmp = decompress();
return tmp.reorgOperations(op, ret, startRow, startColumn, length);
}

@Override
public MatrixBlock append(MatrixBlock that, MatrixBlock ret, boolean cbind) {
if(cbind) // use supported operation
Expand Down Expand Up @@ -394,13 +387,6 @@ public MatrixBlock rexpandOperations(MatrixBlock ret, double max, boolean rows,
return tmp.rexpandOperations(ret, max, rows, cast, ignore, k);
}

/**
 * Fallback replace implementation: calls {@code printDecompressWarning},
 * decompresses this block, and delegates the operation to the resulting
 * uncompressed {@code MatrixBlock}.
 *
 * @param result      result block passed through to the uncompressed implementation
 * @param pattern     value to be replaced
 * @param replacement value to insert instead
 * @return the result of the uncompressed {@code replaceOperations} call
 */
@Override
public MatrixBlock replaceOperations(MatrixValue result, double pattern, double replacement) {
printDecompressWarning("replaceOperations");
MatrixBlock tmp = decompress();
return tmp.replaceOperations(result, pattern, replacement);
}

@Override
public void ctableOperations(Operator op, double scalar, MatrixValue that, CTableMap resultMap,
MatrixBlock resultBlock) {
Expand Down Expand Up @@ -507,8 +493,9 @@ private static boolean isCompressed(MatrixBlock mb) {
return(mb instanceof CompressedMatrixBlock);
}

protected static MatrixBlock getUncompressed(MatrixValue mVal) {
return isCompressed((MatrixBlock) mVal) ? ((CompressedMatrixBlock) mVal).decompress(OptimizerUtils.getConstrainedNumThreads(-1)) : (MatrixBlock) mVal;
public static MatrixBlock getUncompressed(MatrixValue mVal) {
return isCompressed((MatrixBlock) mVal) ? ((CompressedMatrixBlock) mVal)
.decompress(OptimizerUtils.getConstrainedNumThreads(-1)) : (MatrixBlock) mVal;
}

protected void printDecompressWarning(String operation) {
Expand Down
Loading

0 comments on commit 12cdc89

Please sign in to comment.