From 4545eb7a4f8691450bc83071f3bfeab5dcf24274 Mon Sep 17 00:00:00 2001 From: Sumedh Sakdeo Date: Wed, 18 Sep 2024 12:14:21 -0700 Subject: [PATCH 1/2] DLO: Solving multiple objective optimization problem for file count reduction --- .../BaseDataLayoutCandidateSelector.java | 58 +++++++++++ .../ranker/DataLayoutCandidateSelector.java | 15 +++ .../ranker/DataLayoutStrategyScorer.java | 15 +++ .../GreedyMaxBudgetCandidateSelector.java | 39 ++++++++ ...leWeightedSumDataLayoutStrategyScorer.java | 83 ++++++++++++++++ .../TopKDataLayoutCandidateSelector.java | 27 +++++ .../strategy/DataLayoutStrategy.java | 3 + ...ightedSumDataLayoutStrategyScorerTest.java | 99 +++++++++++++++++++ 8 files changed, 339 insertions(+) create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java create mode 100644 libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java new file mode 100644 index 00000000..5627e701 --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java @@ -0,0 +1,58 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; +import org.apache.commons.lang3.tuple.Pair; + +/** + * Base class for data layout candidate selector. It provides the basic functionality to provide + * consistent ranking strategy across multiple candidate selection strategies. + */ +public abstract class BaseDataLayoutCandidateSelector implements DataLayoutCandidateSelector { + + /** + * Rank the candidates to perform data layout optimizations based on the scores. + * + * @param dataLayoutStrategies all data layout strategies with scores computed. + * @return index of the selected data layout strategies ordered by scores. + */ + @Override + public List select(List dataLayoutStrategies) { + PriorityQueue> maxHeap = + new PriorityQueue<>( + new Comparator>() { + + /** + * Compares its two arguments for order. Returns a negative integer, zero, or a + * positive integer as the first argument is less than, equal to, or greater than the + * second. + * + *

+ * + * @param o1 the first object to be compared. + * @param o2 the second object to be compared. + * @return a negative integer, zero, or a positive integer as the first argument is + * less than, equal to, or greater than the second. + * @throws NullPointerException if an argument is null and this comparator does not + * permit null arguments + * @throws ClassCastException if the arguments' types prevent them from being compared + * by this comparator. + */ + @Override + public int compare( + Pair o1, Pair o2) { + return Double.compare(o2.getLeft().getScore(), o1.getLeft().getScore()); + } + }); + + for (int i = 0; i < dataLayoutStrategies.size(); i++) { + maxHeap.add(Pair.of(dataLayoutStrategies.get(i), i)); + } + + return filter(maxHeap); + } + + protected abstract List filter(PriorityQueue> maxHeap); +} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java new file mode 100644 index 00000000..cd56bf8e --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java @@ -0,0 +1,15 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.List; + +public interface DataLayoutCandidateSelector { + + /** + * Pick the candidates to perform data layout optimizations based on the scores. + * + * @param dataLayoutStrategies all data layout strategies with scores computed. + * @return index of the selected data layout strategies. + */ + List select(List dataLayoutStrategies); +} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java new file mode 100644 index 00000000..f9e22bb7 --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java @@ -0,0 +1,15 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.List; + +/** Interface for scoring data layout strategies. */ +public interface DataLayoutStrategyScorer { + /** + * Compute scores for the data layout strategies based on the input data. + * + * @param dataLayoutStrategies the data layout strategies to score + * @return the data layout strategies w/ computed scores + */ + List scoreDataLayoutStrategies(List dataLayoutStrategies); +} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java new file mode 100644 index 00000000..0fe40fd8 --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java @@ -0,0 +1,39 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.ArrayList; +import java.util.List; +import java.util.PriorityQueue; +import org.apache.commons.lang3.tuple.Pair; + +/** + * A greedy candidate selector that selects the top K data layout strategies based on the max + * budget. The max budget is defined by the max estimated compute cost or the max number of tables + * whatever reaches first. + */ +public class GreedyMaxBudgetCandidateSelector extends BaseDataLayoutCandidateSelector { + + private final double maxEstimatedComputeCost; + private final int maxTables; + + public GreedyMaxBudgetCandidateSelector(double maxEstimatedComputeCost, int maxTablesBudget) { + this.maxEstimatedComputeCost = maxEstimatedComputeCost; + this.maxTables = maxTablesBudget; + } + + @Override + protected List filter(PriorityQueue> maxHeap) { + List result = new ArrayList<>(); + double totalEstimatedComputeCost = 0; + int totalTables = 0; + while (!maxHeap.isEmpty() + && totalEstimatedComputeCost < this.maxEstimatedComputeCost + && totalTables < this.maxTables) { + Pair dataLayoutStrategyIntegerPair = maxHeap.poll(); + result.add(dataLayoutStrategyIntegerPair.getRight()); + totalEstimatedComputeCost += dataLayoutStrategyIntegerPair.getLeft().getCost(); + totalTables += 1; + } + return result; + } +} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java new file mode 100644 index 00000000..4d9534c5 --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java @@ -0,0 +1,83 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.ArrayList; +import java.util.List; + +/** + * Optimizing across multiple objective such as (x) Maximize file count reduction (x) Minimize + * compute cost used for compaction by first normalizing the objectives on a common scale and then + * performing scalarization using a weighted sum technique. + */ +public class SimpleWeightedSumDataLayoutStrategyScorer implements DataLayoutStrategyScorer { + + public static final int MINIMIZATION_OBJECTIVE_FACTOR = -1; + + private final double costWeight; + private final double gainWeight; + + public SimpleWeightedSumDataLayoutStrategyScorer(double gainWeight, double costWeight) { + this.gainWeight = gainWeight; + this.costWeight = costWeight; + } + + /** + * Compute scores for the data layout strategies based on the input data. + * + * @param dataLayoutStrategies the data layout strategies to score + * @return the data layout strategies w/ scores + */ + @Override + public List scoreDataLayoutStrategies( + List dataLayoutStrategies) { + + List normalizedDataLayoutStrategies = new ArrayList<>(); + double minCost = minCost(dataLayoutStrategies); + double maxCost = maxCost(dataLayoutStrategies); + double minGain = minGain(dataLayoutStrategies); + double maxGain = maxGain(dataLayoutStrategies); + + for (DataLayoutStrategy dataLayoutStrategy : dataLayoutStrategies) { + double normalizedCost = minMaxNormalize(dataLayoutStrategy.getCost(), minCost, maxCost); + double normalizedGain = minMaxNormalize(dataLayoutStrategy.getGain(), minGain, maxGain); + DataLayoutStrategy normalizedDataLayoutStrategy = + DataLayoutStrategy.builder() + .config(dataLayoutStrategy.getConfig()) + .entropy(dataLayoutStrategy.getEntropy()) + .cost(dataLayoutStrategy.getCost()) + .gain(dataLayoutStrategy.getGain()) + .normalizedComputeCost(normalizedCost) + .normalizedFileCountReduction(normalizedGain) + .score( + (gainWeight * normalizedGain) + + (MINIMIZATION_OBJECTIVE_FACTOR * costWeight * normalizedCost)) + .build(); + normalizedDataLayoutStrategies.add(normalizedDataLayoutStrategy); + } + + return normalizedDataLayoutStrategies; + } + + private double minMaxNormalize(double value, double min, double max) { + if (max == min) { + return 0.0; + } + return (value - min) / (max - min); + } + + private double minCost(List dataLayoutStrategies) { + return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getCost).min().orElse(0); + } + + private double maxCost(List dataLayoutStrategies) { + return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getCost).max().orElse(0); + } + + private double minGain(List dataLayoutStrategies) { + return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getGain).min().orElse(0); + } + + private double maxGain(List dataLayoutStrategies) { + return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getGain).max().orElse(0); + } +} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java new file mode 100644 index 00000000..98603031 --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java @@ -0,0 +1,27 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.ArrayList; +import java.util.List; +import java.util.PriorityQueue; +import org.apache.commons.lang3.tuple.Pair; + +/** A greedy candidate selector that selects the top K data layout strategies, ranked by scores. */ +public class TopKDataLayoutCandidateSelector extends BaseDataLayoutCandidateSelector { + + private int k; + + public TopKDataLayoutCandidateSelector(int k) { + this.k = k; + } + + @Override + protected List filter(PriorityQueue> maxHeap) { + List result = new ArrayList<>(); + while (!maxHeap.isEmpty() && result.size() < this.k) { + Pair dataLayoutStrategyIntegerPair = maxHeap.poll(); + result.add(dataLayoutStrategyIntegerPair.getRight()); + } + return result; + } +} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java index 93f31079..2dbad17a 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java @@ -13,8 +13,11 @@ public class DataLayoutStrategy { private final double score; private final double entropy; + // TODO: refactor cost -> estimated_compute_cost, gain -> estimated_file_count_reduction private final double cost; private final double gain; private final DataCompactionConfig config; + private final double normalizedComputeCost; + private final double normalizedFileCountReduction; // TODO: support sorting config } diff --git a/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java b/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java new file mode 100644 index 00000000..db1cfc43 --- /dev/null +++ b/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java @@ -0,0 +1,99 @@ +package com.linkedin.openhouse.datalayout.ranker; + +import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import java.util.ArrayList; +import java.util.List; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class WeightedSumDataLayoutStrategyScorerTest { + + private List testSampleDataLayoutStrategies; + + @BeforeEach + void init() { + DataLayoutStrategy lowScoreStrategy = DataLayoutStrategy.builder().cost(0.5).gain(1.0).build(); + DataLayoutStrategy midScoreStrategy = + DataLayoutStrategy.builder().cost(50.0).gain(100.0).build(); + DataLayoutStrategy highScoreStrategy = + DataLayoutStrategy.builder().cost(500.0).gain(1000.0).build(); + testSampleDataLayoutStrategies = new ArrayList(); + testSampleDataLayoutStrategies.add(lowScoreStrategy); + testSampleDataLayoutStrategies.add(midScoreStrategy); + testSampleDataLayoutStrategies.add(highScoreStrategy); + } + + @Test + public void testWeightedSumDataLayoutStrategyScorer() { + DataLayoutStrategyScorer dataLayoutStrategyScorer = + new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3); + List normalizedStrategies = + dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies); + Assertions.assertEquals(normalizedStrategies.size(), 3); + + Assertions.assertEquals(normalizedStrategies.get(0).getNormalizedComputeCost(), 0); + Assertions.assertEquals(normalizedStrategies.get(0).getNormalizedFileCountReduction(), 0); + Assertions.assertEquals(normalizedStrategies.get(0).getScore(), 0); + + Assertions.assertEquals( + normalizedStrategies.get(1).getNormalizedComputeCost(), (50.0 - 0.5) / (500.0 - 0.5)); + Assertions.assertEquals( + normalizedStrategies.get(1).getNormalizedFileCountReduction(), + (100 - 1.0) / (1000.0 - 1.0)); + Assertions.assertEquals( + normalizedStrategies.get(1).getScore(), + (0.7 * ((100 - 1.0) / (1000.0 - 1.0))) - (0.3 * ((50.0 - 0.5) / (500.0 - 0.5)))); + + Assertions.assertEquals(normalizedStrategies.get(2).getNormalizedComputeCost(), 1.0); + Assertions.assertEquals(normalizedStrategies.get(2).getNormalizedFileCountReduction(), 1.0); + Assertions.assertEquals(normalizedStrategies.get(2).getScore(), (0.7 * 1.0) - (0.3 * 1.0)); + } + + @Test + public void testWeightSumScorerTopKCandidateSelector() { + DataLayoutStrategyScorer dataLayoutStrategyScorer = + new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3); + List normalizedStrategies = + dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies); + Assertions.assertEquals(normalizedStrategies.size(), 3); + for (int k = 1; k <= 3; k++) { + DataLayoutCandidateSelector dataLayoutCandidateSelector = + new TopKDataLayoutCandidateSelector(k); + List topK = dataLayoutCandidateSelector.select(normalizedStrategies); + Assertions.assertEquals(k, topK.size()); + for (int j = 1; j <= k; j++) { + Assertions.assertEquals(3 - j, topK.get(j - 1)); + } + } + } + + @Test + public void testWeightSumScorerMaxBudgetCandidateSelector() { + DataLayoutStrategyScorer dataLayoutStrategyScorer = + new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3); + List normalizedStrategies = + dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies); + Assertions.assertEquals(3, normalizedStrategies.size()); + + DataLayoutCandidateSelector selectAll = new GreedyMaxBudgetCandidateSelector(550.5, 3); + List selectedAllStrategies = selectAll.select(normalizedStrategies); + Assertions.assertEquals(3, selectedAllStrategies.size()); + Assertions.assertArrayEquals(new Integer[] {2, 1, 0}, selectedAllStrategies.toArray()); + + DataLayoutCandidateSelector selectTwo = new GreedyMaxBudgetCandidateSelector(550.0, 3); + List selectedTwoStrategies = selectTwo.select(normalizedStrategies); + Assertions.assertEquals(2, selectedTwoStrategies.size()); + Assertions.assertArrayEquals(new Integer[] {2, 1}, selectedTwoStrategies.toArray()); + + DataLayoutCandidateSelector selectOne = new GreedyMaxBudgetCandidateSelector(500.0, 3); + List selectedOneStrategies = selectOne.select(normalizedStrategies); + Assertions.assertEquals(1, selectedOneStrategies.size()); + Assertions.assertArrayEquals(new Integer[] {2}, selectedOneStrategies.toArray()); + + DataLayoutCandidateSelector selectNone = new GreedyMaxBudgetCandidateSelector(5000.0, 0); + List selectedNoneStrategies = selectNone.select(normalizedStrategies); + Assertions.assertEquals(0, selectedNoneStrategies.size()); + Assertions.assertArrayEquals(new Integer[] {}, selectedNoneStrategies.toArray()); + } +} From deee98a5f45d5ef58c6b27afcff1c3c5afe90563 Mon Sep 17 00:00:00 2001 From: Sumedh Sakdeo Date: Thu, 19 Sep 2024 14:04:51 -0700 Subject: [PATCH 2/2] Separate ScoredDataLayoutStrategy from raw traits in DataLayoutStrategy --- .../BaseDataLayoutCandidateSelector.java | 14 +++++----- .../ranker/DataLayoutCandidateSelector.java | 4 +-- .../ranker/DataLayoutStrategyScorer.java | 4 ++- .../GreedyMaxBudgetCandidateSelector.java | 9 ++++--- ...leWeightedSumDataLayoutStrategyScorer.java | 20 ++++++++------ .../TopKDataLayoutCandidateSelector.java | 27 ------------------- .../strategy/DataLayoutStrategy.java | 8 ++---- .../strategy/ScoredDataLayoutStrategy.java | 14 ++++++++++ ...ightedSumDataLayoutStrategyScorerTest.java | 9 ++++--- 9 files changed, 51 insertions(+), 58 deletions(-) delete mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java create mode 100644 libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/ScoredDataLayoutStrategy.java diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java index 5627e701..33a40f49 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/BaseDataLayoutCandidateSelector.java @@ -1,6 +1,6 @@ package com.linkedin.openhouse.datalayout.ranker; -import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import com.linkedin.openhouse.datalayout.strategy.ScoredDataLayoutStrategy; import java.util.Comparator; import java.util.List; import java.util.PriorityQueue; @@ -19,10 +19,10 @@ public abstract class BaseDataLayoutCandidateSelector implements DataLayoutCandi * @return index of the selected data layout strategies ordered by scores. */ @Override - public List select(List dataLayoutStrategies) { - PriorityQueue> maxHeap = + public List select(List dataLayoutStrategies) { + PriorityQueue> maxHeap = new PriorityQueue<>( - new Comparator>() { + new Comparator>() { /** * Compares its two arguments for order. Returns a negative integer, zero, or a @@ -42,7 +42,8 @@ public List select(List dataLayoutStrategies) { */ @Override public int compare( - Pair o1, Pair o2) { + Pair o1, + Pair o2) { return Double.compare(o2.getLeft().getScore(), o1.getLeft().getScore()); } }); @@ -54,5 +55,6 @@ public int compare( return filter(maxHeap); } - protected abstract List filter(PriorityQueue> maxHeap); + protected abstract List filter( + PriorityQueue> maxHeap); } diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java index cd56bf8e..49e884bd 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutCandidateSelector.java @@ -1,6 +1,6 @@ package com.linkedin.openhouse.datalayout.ranker; -import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import com.linkedin.openhouse.datalayout.strategy.ScoredDataLayoutStrategy; import java.util.List; public interface DataLayoutCandidateSelector { @@ -11,5 +11,5 @@ public interface DataLayoutCandidateSelector { * @param dataLayoutStrategies all data layout strategies with scores computed. * @return index of the selected data layout strategies. */ - List select(List dataLayoutStrategies); + List select(List dataLayoutStrategies); } diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java index f9e22bb7..3804a15a 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/DataLayoutStrategyScorer.java @@ -1,6 +1,7 @@ package com.linkedin.openhouse.datalayout.ranker; import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import com.linkedin.openhouse.datalayout.strategy.ScoredDataLayoutStrategy; import java.util.List; /** Interface for scoring data layout strategies. */ @@ -11,5 +12,6 @@ public interface DataLayoutStrategyScorer { * @param dataLayoutStrategies the data layout strategies to score * @return the data layout strategies w/ computed scores */ - List scoreDataLayoutStrategies(List dataLayoutStrategies); + List scoreDataLayoutStrategies( + List dataLayoutStrategies); } diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java index 0fe40fd8..e5e306ad 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/GreedyMaxBudgetCandidateSelector.java @@ -1,6 +1,6 @@ package com.linkedin.openhouse.datalayout.ranker; -import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import com.linkedin.openhouse.datalayout.strategy.ScoredDataLayoutStrategy; import java.util.ArrayList; import java.util.List; import java.util.PriorityQueue; @@ -22,16 +22,17 @@ public GreedyMaxBudgetCandidateSelector(double maxEstimatedComputeCost, int maxT } @Override - protected List filter(PriorityQueue> maxHeap) { + protected List filter(PriorityQueue> maxHeap) { List result = new ArrayList<>(); double totalEstimatedComputeCost = 0; int totalTables = 0; while (!maxHeap.isEmpty() && totalEstimatedComputeCost < this.maxEstimatedComputeCost && totalTables < this.maxTables) { - Pair dataLayoutStrategyIntegerPair = maxHeap.poll(); + Pair dataLayoutStrategyIntegerPair = maxHeap.poll(); result.add(dataLayoutStrategyIntegerPair.getRight()); - totalEstimatedComputeCost += dataLayoutStrategyIntegerPair.getLeft().getCost(); + totalEstimatedComputeCost += + dataLayoutStrategyIntegerPair.getLeft().getDataLayoutStrategy().getCost(); totalTables += 1; } return result; diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java index 4d9534c5..db91a067 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/SimpleWeightedSumDataLayoutStrategyScorer.java @@ -1,6 +1,7 @@ package com.linkedin.openhouse.datalayout.ranker; import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import com.linkedin.openhouse.datalayout.strategy.ScoredDataLayoutStrategy; import java.util.ArrayList; import java.util.List; @@ -28,10 +29,10 @@ public SimpleWeightedSumDataLayoutStrategyScorer(double gainWeight, double costW * @return the data layout strategies w/ scores */ @Override - public List scoreDataLayoutStrategies( + public List scoreDataLayoutStrategies( List dataLayoutStrategies) { - List normalizedDataLayoutStrategies = new ArrayList<>(); + List normalizedDataLayoutStrategies = new ArrayList<>(); double minCost = minCost(dataLayoutStrategies); double maxCost = maxCost(dataLayoutStrategies); double minGain = minGain(dataLayoutStrategies); @@ -40,12 +41,15 @@ public List scoreDataLayoutStrategies( for (DataLayoutStrategy dataLayoutStrategy : dataLayoutStrategies) { double normalizedCost = minMaxNormalize(dataLayoutStrategy.getCost(), minCost, maxCost); double normalizedGain = minMaxNormalize(dataLayoutStrategy.getGain(), minGain, maxGain); - DataLayoutStrategy normalizedDataLayoutStrategy = - DataLayoutStrategy.builder() - .config(dataLayoutStrategy.getConfig()) - .entropy(dataLayoutStrategy.getEntropy()) - .cost(dataLayoutStrategy.getCost()) - .gain(dataLayoutStrategy.getGain()) + ScoredDataLayoutStrategy normalizedDataLayoutStrategy = + ScoredDataLayoutStrategy.builder() + .dataLayoutStrategy( + DataLayoutStrategy.builder() + .config(dataLayoutStrategy.getConfig()) + .entropy(dataLayoutStrategy.getEntropy()) + .cost(dataLayoutStrategy.getCost()) + .gain(dataLayoutStrategy.getGain()) + .build()) .normalizedComputeCost(normalizedCost) .normalizedFileCountReduction(normalizedGain) .score( diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java deleted file mode 100644 index 98603031..00000000 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/ranker/TopKDataLayoutCandidateSelector.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.linkedin.openhouse.datalayout.ranker; - -import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; -import java.util.ArrayList; -import java.util.List; -import java.util.PriorityQueue; -import org.apache.commons.lang3.tuple.Pair; - -/** A greedy candidate selector that selects the top K data layout strategies, ranked by scores. */ -public class TopKDataLayoutCandidateSelector extends BaseDataLayoutCandidateSelector { - - private int k; - - public TopKDataLayoutCandidateSelector(int k) { - this.k = k; - } - - @Override - protected List filter(PriorityQueue> maxHeap) { - List result = new ArrayList<>(); - while (!maxHeap.isEmpty() && result.size() < this.k) { - Pair dataLayoutStrategyIntegerPair = maxHeap.poll(); - result.add(dataLayoutStrategyIntegerPair.getRight()); - } - return result; - } -} diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java index 2dbad17a..329b08d9 100644 --- a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/DataLayoutStrategy.java @@ -4,20 +4,16 @@ import lombok.Builder; import lombok.Data; -/** - * Configuration for data layout optimization. Supports compaction and optionally sorting. The score - * is used to rank the layout configurations. - */ +/** Raw traits for table to feed into data layout optimization strategy generator. */ @Data @Builder public class DataLayoutStrategy { + // TODO: remove score from here. private final double score; private final double entropy; // TODO: refactor cost -> estimated_compute_cost, gain -> estimated_file_count_reduction private final double cost; private final double gain; private final DataCompactionConfig config; - private final double normalizedComputeCost; - private final double normalizedFileCountReduction; // TODO: support sorting config } diff --git a/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/ScoredDataLayoutStrategy.java b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/ScoredDataLayoutStrategy.java new file mode 100644 index 00000000..8f321142 --- /dev/null +++ b/libs/datalayout/src/main/java/com/linkedin/openhouse/datalayout/strategy/ScoredDataLayoutStrategy.java @@ -0,0 +1,14 @@ +package com.linkedin.openhouse.datalayout.strategy; + +import lombok.Builder; +import lombok.Data; + +/** Data layout strategy with a score, and normalized units for traits. */ +@Data +@Builder +public class ScoredDataLayoutStrategy { + private final DataLayoutStrategy dataLayoutStrategy; + private final double score; + private final double normalizedComputeCost; + private final double normalizedFileCountReduction; +} diff --git a/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java b/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java index db1cfc43..0c5c771b 100644 --- a/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java +++ b/libs/datalayout/src/test/java/com/linkedin/openhouse/datalayout/ranker/WeightedSumDataLayoutStrategyScorerTest.java @@ -1,6 +1,7 @@ package com.linkedin.openhouse.datalayout.ranker; import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy; +import com.linkedin.openhouse.datalayout.strategy.ScoredDataLayoutStrategy; import java.util.ArrayList; import java.util.List; import org.junit.jupiter.api.Assertions; @@ -28,7 +29,7 @@ void init() { public void testWeightedSumDataLayoutStrategyScorer() { DataLayoutStrategyScorer dataLayoutStrategyScorer = new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3); - List normalizedStrategies = + List normalizedStrategies = dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies); Assertions.assertEquals(normalizedStrategies.size(), 3); @@ -54,12 +55,12 @@ public void testWeightedSumDataLayoutStrategyScorer() { public void testWeightSumScorerTopKCandidateSelector() { DataLayoutStrategyScorer dataLayoutStrategyScorer = new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3); - List normalizedStrategies = + List normalizedStrategies = dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies); Assertions.assertEquals(normalizedStrategies.size(), 3); for (int k = 1; k <= 3; k++) { DataLayoutCandidateSelector dataLayoutCandidateSelector = - new TopKDataLayoutCandidateSelector(k); + new GreedyMaxBudgetCandidateSelector(Double.MAX_VALUE, k); List topK = dataLayoutCandidateSelector.select(normalizedStrategies); Assertions.assertEquals(k, topK.size()); for (int j = 1; j <= k; j++) { @@ -72,7 +73,7 @@ public void testWeightSumScorerTopKCandidateSelector() { public void testWeightSumScorerMaxBudgetCandidateSelector() { DataLayoutStrategyScorer dataLayoutStrategyScorer = new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3); - List normalizedStrategies = + List normalizedStrategies = dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies); Assertions.assertEquals(3, normalizedStrategies.size());