Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DLO: Multi-objective optimization for auto-compaction #201

Merged
merged 2 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.commons.lang3.tuple.Pair;

/**
* Base class for data layout candidate selector. It provides the basic functionality to provide
* consistent ranking strategy across multiple candidate selection strategies.
*/
public abstract class BaseDataLayoutCandidateSelector implements DataLayoutCandidateSelector {

/**
* Rank the candidates to perform data layout optimizations based on the scores.
*
* @param dataLayoutStrategies all data layout strategies with scores computed.
* @return index of the selected data layout strategies ordered by scores.
*/
@Override
public List<Integer> select(List<DataLayoutStrategy> dataLayoutStrategies) {
PriorityQueue<Pair<DataLayoutStrategy, Integer>> maxHeap =
new PriorityQueue<>(
new Comparator<Pair<DataLayoutStrategy, Integer>>() {
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved

/**
* Compares its two arguments for order. Returns a negative integer, zero, or a
* positive integer as the first argument is less than, equal to, or greater than the
* second.
*
* <p>
*
* @param o1 the first object to be compared.
* @param o2 the second object to be compared.
* @return a negative integer, zero, or a positive integer as the first argument is
* less than, equal to, or greater than the second.
* @throws NullPointerException if an argument is null and this comparator does not
* permit null arguments
* @throws ClassCastException if the arguments' types prevent them from being compared
* by this comparator.
*/
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved
@Override
public int compare(
Pair<DataLayoutStrategy, Integer> o1, Pair<DataLayoutStrategy, Integer> o2) {
return Double.compare(o2.getLeft().getScore(), o1.getLeft().getScore());
}
});

for (int i = 0; i < dataLayoutStrategies.size(); i++) {
maxHeap.add(Pair.of(dataLayoutStrategies.get(i), i));
}

return filter(maxHeap);
}

protected abstract List<Integer> filter(PriorityQueue<Pair<DataLayoutStrategy, Integer>> maxHeap);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.List;

public interface DataLayoutCandidateSelector {

/**
* Pick the candidates to perform data layout optimizations based on the scores.
*
* @param dataLayoutStrategies all data layout strategies with scores computed.
* @return index of the selected data layout strategies.
*/
List<Integer> select(List<DataLayoutStrategy> dataLayoutStrategies);
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.List;

/** Interface for scoring data layout strategies. */
public interface DataLayoutStrategyScorer {
/**
* Compute scores for the data layout strategies based on the input data.
*
* @param dataLayoutStrategies the data layout strategies to score
* @return the data layout strategies w/ computed scores
*/
List<DataLayoutStrategy> scoreDataLayoutStrategies(List<DataLayoutStrategy> dataLayoutStrategies);
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.commons.lang3.tuple.Pair;

/**
* A greedy candidate selector that selects the top K data layout strategies based on the max
* budget. The max budget is defined by the max estimated compute cost or the max number of tables
* whatever reaches first.
*/
public class GreedyMaxBudgetCandidateSelector extends BaseDataLayoutCandidateSelector {

private final double maxEstimatedComputeCost;
private final int maxTables;
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved

public GreedyMaxBudgetCandidateSelector(double maxEstimatedComputeCost, int maxTablesBudget) {
this.maxEstimatedComputeCost = maxEstimatedComputeCost;
this.maxTables = maxTablesBudget;
}

@Override
protected List<Integer> filter(PriorityQueue<Pair<DataLayoutStrategy, Integer>> maxHeap) {
List<Integer> result = new ArrayList<>();
double totalEstimatedComputeCost = 0;
int totalTables = 0;
while (!maxHeap.isEmpty()
&& totalEstimatedComputeCost < this.maxEstimatedComputeCost
&& totalTables < this.maxTables) {
Pair<DataLayoutStrategy, Integer> dataLayoutStrategyIntegerPair = maxHeap.poll();
result.add(dataLayoutStrategyIntegerPair.getRight());
totalEstimatedComputeCost += dataLayoutStrategyIntegerPair.getLeft().getCost();
totalTables += 1;
}
return result;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.ArrayList;
import java.util.List;

/**
* Optimizing across multiple objective such as (x) Maximize file count reduction (x) Minimize
* compute cost used for compaction by first normalizing the objectives on a common scale and then
* performing scalarization using a weighted sum technique.
*/
public class SimpleWeightedSumDataLayoutStrategyScorer implements DataLayoutStrategyScorer {

public static final int MINIMIZATION_OBJECTIVE_FACTOR = -1;
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved

private final double costWeight;
private final double gainWeight;

public SimpleWeightedSumDataLayoutStrategyScorer(double gainWeight, double costWeight) {
this.gainWeight = gainWeight;
this.costWeight = costWeight;
}

/**
* Compute scores for the data layout strategies based on the input data.
*
* @param dataLayoutStrategies the data layout strategies to score
* @return the data layout strategies w/ scores
*/
@Override
public List<DataLayoutStrategy> scoreDataLayoutStrategies(
List<DataLayoutStrategy> dataLayoutStrategies) {

List<DataLayoutStrategy> normalizedDataLayoutStrategies = new ArrayList<>();
double minCost = minCost(dataLayoutStrategies);
double maxCost = maxCost(dataLayoutStrategies);
double minGain = minGain(dataLayoutStrategies);
double maxGain = maxGain(dataLayoutStrategies);

for (DataLayoutStrategy dataLayoutStrategy : dataLayoutStrategies) {
double normalizedCost = minMaxNormalize(dataLayoutStrategy.getCost(), minCost, maxCost);
double normalizedGain = minMaxNormalize(dataLayoutStrategy.getGain(), minGain, maxGain);
DataLayoutStrategy normalizedDataLayoutStrategy =
DataLayoutStrategy.builder()
.config(dataLayoutStrategy.getConfig())
.entropy(dataLayoutStrategy.getEntropy())
.cost(dataLayoutStrategy.getCost())
.gain(dataLayoutStrategy.getGain())
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved
.normalizedComputeCost(normalizedCost)
.normalizedFileCountReduction(normalizedGain)
.score(
(gainWeight * normalizedGain)
+ (MINIMIZATION_OBJECTIVE_FACTOR * costWeight * normalizedCost))
.build();
teamurko marked this conversation as resolved.
Show resolved Hide resolved
normalizedDataLayoutStrategies.add(normalizedDataLayoutStrategy);
}

return normalizedDataLayoutStrategies;
}

private double minMaxNormalize(double value, double min, double max) {
if (max == min) {
return 0.0;
}
return (value - min) / (max - min);
}

private double minCost(List<DataLayoutStrategy> dataLayoutStrategies) {
return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getCost).min().orElse(0);
}

private double maxCost(List<DataLayoutStrategy> dataLayoutStrategies) {
return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getCost).max().orElse(0);
}

private double minGain(List<DataLayoutStrategy> dataLayoutStrategies) {
return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getGain).min().orElse(0);
}

private double maxGain(List<DataLayoutStrategy> dataLayoutStrategies) {
return dataLayoutStrategies.stream().mapToDouble(DataLayoutStrategy::getGain).max().orElse(0);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.commons.lang3.tuple.Pair;

/** A greedy candidate selector that selects the top K data layout strategies, ranked by scores. */
public class TopKDataLayoutCandidateSelector extends BaseDataLayoutCandidateSelector {
sumedhsakdeo marked this conversation as resolved.
Show resolved Hide resolved

private int k;

public TopKDataLayoutCandidateSelector(int k) {
this.k = k;
}

@Override
protected List<Integer> filter(PriorityQueue<Pair<DataLayoutStrategy, Integer>> maxHeap) {
List<Integer> result = new ArrayList<>();
while (!maxHeap.isEmpty() && result.size() < this.k) {
Pair<DataLayoutStrategy, Integer> dataLayoutStrategyIntegerPair = maxHeap.poll();
result.add(dataLayoutStrategyIntegerPair.getRight());
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
/**
 * A candidate data layout strategy together with the objective values used to rank it.
 *
 * <p>NOTE(review): the class-level annotations sit above this diff hunk and are not visible here;
 * the builder and getters used by the ranker classes are presumably generated there. Confirm
 * before reordering or renaming fields.
 */
public class DataLayoutStrategy {
// Ranking score; the candidate selectors order strategies by this value, highest first.
private final double score;
// NOTE(review): how entropy is computed is not visible in this hunk; the scorer only copies it.
private final double entropy;
// TODO: refactor cost -> estimated_compute_cost, gain -> estimated_file_count_reduction
// Estimated compute cost of applying this strategy (raw, not normalized).
private final double cost;
// Estimated file count reduction achieved by this strategy (raw, not normalized).
private final double gain;
// Compaction configuration associated with this strategy.
private final DataCompactionConfig config;
// Cost after min-max normalization across the scored batch; populated by the scorer.
private final double normalizedComputeCost;
// Gain after min-max normalization across the scored batch; populated by the scorer.
private final double normalizedFileCountReduction;
// TODO: support sorting config
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package com.linkedin.openhouse.datalayout.ranker;

import com.linkedin.openhouse.datalayout.strategy.DataLayoutStrategy;
import java.util.ArrayList;
import java.util.List;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@link SimpleWeightedSumDataLayoutStrategyScorer} and the candidate selectors that
 * consume its scores.
 */
public class WeightedSumDataLayoutStrategyScorerTest {

  private List<DataLayoutStrategy> testSampleDataLayoutStrategies;

  /** Builds three strategies whose cost and gain grow together: index 0 lowest, index 2 highest. */
  @BeforeEach
  void init() {
    DataLayoutStrategy lowScoreStrategy = DataLayoutStrategy.builder().cost(0.5).gain(1.0).build();
    DataLayoutStrategy midScoreStrategy =
        DataLayoutStrategy.builder().cost(50.0).gain(100.0).build();
    DataLayoutStrategy highScoreStrategy =
        DataLayoutStrategy.builder().cost(500.0).gain(1000.0).build();
    testSampleDataLayoutStrategies = new ArrayList<DataLayoutStrategy>();
    testSampleDataLayoutStrategies.add(lowScoreStrategy);
    testSampleDataLayoutStrategies.add(midScoreStrategy);
    testSampleDataLayoutStrategies.add(highScoreStrategy);
  }

  /** Verifies min-max normalization and the weighted-sum score for each sample strategy. */
  @Test
  public void testWeightedSumDataLayoutStrategyScorer() {
    DataLayoutStrategyScorer dataLayoutStrategyScorer =
        new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3);
    List<DataLayoutStrategy> normalizedStrategies =
        dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies);
    Assertions.assertEquals(3, normalizedStrategies.size());

    // Lowest cost and gain normalize to 0, hence a score of 0.
    Assertions.assertEquals(0, normalizedStrategies.get(0).getNormalizedComputeCost());
    Assertions.assertEquals(0, normalizedStrategies.get(0).getNormalizedFileCountReduction());
    Assertions.assertEquals(0, normalizedStrategies.get(0).getScore());

    Assertions.assertEquals(
        (50.0 - 0.5) / (500.0 - 0.5), normalizedStrategies.get(1).getNormalizedComputeCost());
    Assertions.assertEquals(
        (100 - 1.0) / (1000.0 - 1.0),
        normalizedStrategies.get(1).getNormalizedFileCountReduction());
    Assertions.assertEquals(
        (0.7 * ((100 - 1.0) / (1000.0 - 1.0))) - (0.3 * ((50.0 - 0.5) / (500.0 - 0.5))),
        normalizedStrategies.get(1).getScore());

    // Highest cost and gain normalize to 1.
    Assertions.assertEquals(1.0, normalizedStrategies.get(2).getNormalizedComputeCost());
    Assertions.assertEquals(1.0, normalizedStrategies.get(2).getNormalizedFileCountReduction());
    Assertions.assertEquals((0.7 * 1.0) - (0.3 * 1.0), normalizedStrategies.get(2).getScore());
  }

  /** Verifies that top-K selection returns the K best indices in descending score order. */
  @Test
  public void testWeightSumScorerTopKCandidateSelector() {
    DataLayoutStrategyScorer dataLayoutStrategyScorer =
        new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3);
    List<DataLayoutStrategy> normalizedStrategies =
        dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies);
    Assertions.assertEquals(3, normalizedStrategies.size());
    for (int k = 1; k <= 3; k++) {
      DataLayoutCandidateSelector dataLayoutCandidateSelector =
          new TopKDataLayoutCandidateSelector(k);
      List<Integer> topK = dataLayoutCandidateSelector.select(normalizedStrategies);
      Assertions.assertEquals(k, topK.size());
      for (int j = 1; j <= k; j++) {
        // Scores increase with the input index, so the ranking is 2, 1, 0.
        Assertions.assertEquals(3 - j, topK.get(j - 1));
      }
    }
  }

  /** Verifies budget-based selection against both the compute-cost and table-count limits. */
  @Test
  public void testWeightSumScorerMaxBudgetCandidateSelector() {
    DataLayoutStrategyScorer dataLayoutStrategyScorer =
        new SimpleWeightedSumDataLayoutStrategyScorer(0.7, 0.3);
    List<DataLayoutStrategy> normalizedStrategies =
        dataLayoutStrategyScorer.scoreDataLayoutStrategies(testSampleDataLayoutStrategies);
    Assertions.assertEquals(3, normalizedStrategies.size());

    // Total cost 500 + 50 + 0.5 = 550.5 fits the budget exactly.
    DataLayoutCandidateSelector selectAll = new GreedyMaxBudgetCandidateSelector(550.5, 3);
    List<Integer> selectedAllStrategies = selectAll.select(normalizedStrategies);
    Assertions.assertEquals(3, selectedAllStrategies.size());
    Assertions.assertArrayEquals(new Integer[] {2, 1, 0}, selectedAllStrategies.toArray());

    // After two picks the accumulated cost (550) is no longer below the budget.
    DataLayoutCandidateSelector selectTwo = new GreedyMaxBudgetCandidateSelector(550.0, 3);
    List<Integer> selectedTwoStrategies = selectTwo.select(normalizedStrategies);
    Assertions.assertEquals(2, selectedTwoStrategies.size());
    Assertions.assertArrayEquals(new Integer[] {2, 1}, selectedTwoStrategies.toArray());

    // The first pick alone (500) already reaches the budget.
    DataLayoutCandidateSelector selectOne = new GreedyMaxBudgetCandidateSelector(500.0, 3);
    List<Integer> selectedOneStrategies = selectOne.select(normalizedStrategies);
    Assertions.assertEquals(1, selectedOneStrategies.size());
    Assertions.assertArrayEquals(new Integer[] {2}, selectedOneStrategies.toArray());

    // A table budget of zero selects nothing, regardless of the compute budget.
    DataLayoutCandidateSelector selectNone = new GreedyMaxBudgetCandidateSelector(5000.0, 0);
    List<Integer> selectedNoneStrategies = selectNone.select(normalizedStrategies);
    Assertions.assertEquals(0, selectedNoneStrategies.size());
    Assertions.assertArrayEquals(new Integer[] {}, selectedNoneStrategies.toArray());
  }
}
Loading