Skip to content

Commit

Permalink
[RFC] general support for Adaptive Pooling Ops (#2661)
Browse files Browse the repository at this point in the history
Adaptive pooling ops can only be decomposed into their non-adaptive
counterparts in trivial cases.

For example, the current decomposition for AtenAdaptiveAvgPool1dOp in
DecomposeComplexOps.cpp supports outSize = inSize (i.e., do literally
nothing), and outSize = 1 (i.e., do a batched average).

The reason adaptive pooling ops are difficult to lower to linalg is that
they are not constantly strided. They are computed by taking an input
tensor of shape (N, C, Hin), and an output size Hout, and computing the
output tensor at position (n,c, h) in the following way:

1. compute st(h) = (h*Hin)//Hout
2. compute en(h) = 1 + ((h+1)*Hin -1)//Hout
3. apply a computation (max or avg) to the slice: INPUT[n, c,
st(h):en(h)]

The provided sample implementation (for ConvertAtenAdaptiveAvgPool1dOp)
uses tensor.extract to access the input tensor inside the payload of a
linalg generic op. This is likely an unattractive use of linalg generic
ops, which is why I am asking for some more targeted feedback on the
validity of this approach before attempting to support the many other
adaptive pooling ops.

Specifically:

- Is the performance of this implementation bad enough to warrant
targeting different dialects entirely? e.g. TMtensor/linalg ext/ etc.
- If the provided implementation is of acceptable performance to the
community, then is it permissable to remove the Adaptive pooling
decompositions from DecomposeComplexOps.cpp? Based on the current
structure of the -torch-decompose-complex-ops pass, it does not seem
possible to only decompose the adaptive ops in special cases (it seems
to get stuck in an infinite loop on a match failure). I would be happy
to instead incorporate the case logic into the conversion directly, and
remove the decompositions once they are rendered completely obsolete.

As long as this approach is acceptable, I can clean up the
implementation with some helper functions, and quickly add support for
each of the remaining Adaptive pooling ops.
  • Loading branch information
zjgarvey authored Jan 9, 2024
1 parent 4dd17f0 commit 07d0645
Show file tree
Hide file tree
Showing 4 changed files with 280 additions and 33 deletions.
237 changes: 212 additions & 25 deletions lib/Conversion/TorchToLinalg/Pooling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,18 +90,19 @@ static LogicalResult createPoolingOp(
SmallVector<int64_t> lowPaddingIncludingNC = {0, 0};
lowPaddingIncludingNC.append(paddingInts);
SmallVector<int64_t> highPaddingIncludingNC = lowPaddingIncludingNC;

if (ceilMode) {
for (int64_t i = 0; i < dimensionality; ++i) {
highPaddingIncludingNC[i + 2] += strideInts[i];
}
}

Value initValue = rewriter.create<arith::ConstantOp>(loc, cast<TypedAttr>(initValueAttr));
Value initValue =
rewriter.create<arith::ConstantOp>(loc, cast<TypedAttr>(initValueAttr));
paddedInput = torch_to_linalg::getPaddedTensor(
op, rewriter, self, lowPaddingIncludingNC, highPaddingIncludingNC,
initValue);

Value N = getDimOp(rewriter, loc, self, 0);
Value C = getDimOp(rewriter, loc, self, 1);

Expand Down Expand Up @@ -141,7 +142,6 @@ static LogicalResult createPoolingOp(
return success();
}


namespace {
class ConvertAtenMaxPool2dOp : public OpConversionPattern<AtenMaxPool2dOp> {
public:
Expand All @@ -163,7 +163,8 @@ class ConvertAtenMaxPool2dOp : public OpConversionPattern<AtenMaxPool2dOp> {
bool ceilMode;
SmallVector<Value, 2> kernelSizeIntValues;
SmallVector<int64_t, 2> strideInts, paddingInts, dilationInts;
if (!matchPattern(op.getDilation(), m_TorchListOfConstantInts(dilationInts)))
if (!matchPattern(op.getDilation(),
m_TorchListOfConstantInts(dilationInts)))
return rewriter.notifyMatchFailure(op,
"only support constant int dilations");
if (failed(checkAndGetPoolingParameters<AtenMaxPool2dOp>(
Expand Down Expand Up @@ -241,7 +242,8 @@ class ConvertAtenMaxPool2dWithIndicesOp
bool ceilMode;
SmallVector<Value, 2> kernelSizeIntValues;
SmallVector<int64_t, 2> strideInts, paddingInts, dilationInts;
if (!matchPattern(op.getDilation(), m_TorchListOfConstantInts(dilationInts)))
if (!matchPattern(op.getDilation(),
m_TorchListOfConstantInts(dilationInts)))
return rewriter.notifyMatchFailure(op,
"only support constant int dilations");
if (failed(checkAndGetPoolingParameters<AtenMaxPool2dWithIndicesOp>(
Expand Down Expand Up @@ -372,7 +374,6 @@ class ConvertAtenMaxPool2dWithIndicesOp
};
} // namespace


namespace {
template <typename OpTy, typename PoolingOpTy, int Dim>
class ConvertAtenAvgPoolOp : public OpConversionPattern<OpTy> {
Expand All @@ -383,7 +384,7 @@ class ConvertAtenAvgPoolOp : public OpConversionPattern<OpTy> {
ConversionPatternRewriter &rewriter) const override {
if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
return failure();

Location loc = op->getLoc();
const TypeConverter *typeConverter = this->getTypeConverter();
Value self = adaptor.getSelf();
Expand All @@ -397,9 +398,9 @@ class ConvertAtenAvgPoolOp : public OpConversionPattern<OpTy> {
bool ceilMode;
SmallVector<Value, Dim> kernelSizeIntValues;
SmallVector<int64_t, Dim> strideInts, paddingInts, dilationInts(Dim, 1);
if (failed(checkAndGetPoolingParameters<OpTy>(
op, rewriter, typeConverter, ceilMode, kernelSizeIntValues,
strideInts, paddingInts)))
if (failed(checkAndGetPoolingParameters<OpTy>(op, rewriter, typeConverter,
ceilMode, kernelSizeIntValues,
strideInts, paddingInts)))
return rewriter.notifyMatchFailure(op, "invalid pooling parameters");

// TODO: Add support for count_include_pad equal to `False`.
Expand All @@ -415,30 +416,32 @@ class ConvertAtenAvgPoolOp : public OpConversionPattern<OpTy> {

// `sumPool` contains the result of sumpool operation over the input.
Value sumPool, paddedInput;
SmallVector<Value, Dim+2> outTensorShape;
SmallVector<Value, Dim + 2> outTensorShape;
if (failed(createPoolingOp<PoolingOpTy>(
op, rewriter, self, /*supportNonFPInput=*/true, ceilMode,
/*dimensionality=*/Dim, kernelSizeIntValues, strideInts, paddingInts,
dilationInts, rewriter.getZeroAttr(inputElementType), outTensorShape,
paddedInput, sumPool)))
/*dimensionality=*/Dim, kernelSizeIntValues, strideInts,
paddingInts, dilationInts, rewriter.getZeroAttr(inputElementType),
outTensorShape, paddedInput, sumPool)))
return rewriter.notifyMatchFailure(op, "unable to compute sumpool");
Value divisor;
if constexpr (std::is_same<OpTy, AtenAvgPool2dOp>()) {
Value kHtimeskW = rewriter.create<arith::MulIOp>(
loc, kernelSizeIntValues[0], kernelSizeIntValues[1]);
divisor = op.getDivisorOverride().getType().template isa<Torch::NoneType>()
? kHtimeskW
: adaptor.getDivisorOverride();
divisor =
op.getDivisorOverride().getType().template isa<Torch::NoneType>()
? kHtimeskW
: adaptor.getDivisorOverride();
} else {
divisor = kernelSizeIntValues[0];
}
divisor = convertScalarToDtype(rewriter, loc, divisor, resultElementType);

Value outputTensor = rewriter.create<tensor::EmptyOp>(
loc, getAsOpFoldResult(outTensorShape), resultElementType);
SmallVector<AffineMap> indexingMapsAvg(2, rewriter.getMultiDimIdentityMap(Dim+2));
SmallVector<AffineMap> indexingMapsAvg(
2, rewriter.getMultiDimIdentityMap(Dim + 2));
SmallVector<utils::IteratorType> iteratorTypesAvg(
Dim+2, utils::IteratorType::parallel);
Dim + 2, utils::IteratorType::parallel);
Value avgPool =
rewriter
.create<linalg::GenericOp>(
Expand All @@ -459,8 +462,188 @@ class ConvertAtenAvgPoolOp : public OpConversionPattern<OpTy> {
return success();
}
};
}
} // namespace

/*
This section is for lowering adaptive pooling ops, which cannot generally be
decomposed into typical pooling ops. Given an input tensor of rank (N,C,Hin) and
an output spatial size Hout, an element of the output tensor at position (n, c,
h) is computed as follows.
1. compute st(h) = (h*Hin)//Hout
2. compute en(h) = 1 + ((h+1)*Hin - 1)//Hout
3. apply the operation (max or avg) over input[n, c, st(h):en(h)]
This is problematic for linalg ops for a few reasons:
1. The access to the input tensor is not constantly strided
2. The size of the window itself is not contant: en(h) - st(h) can vary with
h! Although it is a bit like using a hammer to paint, our workaround is to use
tensor.extract to access the elements of the input tensor inside our linalg
generic op's payload.
Current TODO's:
1. gather most of the boilerplate out of this op and make it into an
adaptive pooling helper function.
2. figure out what to do with the conflicting decompositions in
DecomposeComplexOps.cpp
3. Implement more efficient passes for when the kernel-size, input spatial
dims, and output spatial dims are constant.
*/

namespace {
class ConvertAtenAdaptiveAvgPool1dOp
: public OpConversionPattern<AtenAdaptiveAvgPool1dOp> {
public:
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(AtenAdaptiveAvgPool1dOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {

Location loc = op->getLoc();
const TypeConverter *typeConverter = getTypeConverter();

// get rank of input (same as rank of output)
int64_t rank =
adaptor.getSelf().getType().cast<RankedTensorType>().getRank();
// input operand should be NCH (i.e. rank 3)
if (rank != 3) {
return rewriter.notifyMatchFailure(op, "only supports input type NCH");
}

// input tensor and output shape
Value input = adaptor.getSelf();
Value outputShape = op.getOutputSize();
SmallVector<Value> outShapeVector;
getListConstructElements(outputShape, outShapeVector);
outShapeVector =
getTypeConvertedValues(rewriter, loc, typeConverter, outShapeVector);
Value hIn = getDimOp(rewriter, loc, input, 2);
Value hOut = outShapeVector[0];
Value hOutIndex = castIntToIndex(rewriter, loc, hOut);
RankedTensorType inputType = input.getType().cast<RankedTensorType>();
RankedTensorType outputType =
typeConverter->convertType(op.getResult().getType())
.cast<RankedTensorType>();

// get elementType of input tensor
Type elementType = inputType.getElementType();

// make an iteration space of size kMax = 1 + ceildiv (hIn - 1) , hOut
Type boolType = rewriter.getI1Type();
Value kIter;
Value constantOne =
rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1));
Value hInPlusOne = rewriter.create<arith::SubIOp>(loc, hIn, constantOne);
Value kMaxMinusOne =
rewriter.create<arith::CeilDivSIOp>(loc, hInPlusOne, hOutIndex);
Value kMax = rewriter.create<arith::AddIOp>(loc, constantOne, kMaxMinusOne);
kIter = rewriter.create<tensor::EmptyOp>(
loc, getAsOpFoldResult(ValueRange({kMax})), boolType);

// need to buffer input, else there will possibly be an out of bounds access
// later buffVal = 0 for avg pooling and -inf for max pooling
Value buffVal = rewriter.create<arith::ConstantOp>(
loc, elementType, rewriter.getFloatAttr(elementType, 0));
SmallVector<int64_t> lowPadding = {0, 0, 0};
SmallVector<int64_t> highPadding = {0, 0, 1};
Value buffInput = torch_to_linalg::getPaddedTensor(
op, rewriter, input, lowPadding, highPadding, buffVal);

// make a list of outputSizes
SmallVector<Value> outputSizes;
for (unsigned i = 0; i < rank - 1; i++) {
outputSizes.push_back(getDimOp(rewriter, loc, input, i));
}
outputSizes.push_back(hOutIndex);

// initialize a kernel size tensor (only for avg pooling)
Value kSizeTensor = rewriter.create<tensor::EmptyOp>(
loc, getAsOpFoldResult(ValueRange({hOutIndex})), elementType);

// initialize an output tensor
Value initOutput =
createInitTensor(rewriter, loc, outputSizes, elementType, buffVal);

// setup indexing maps and iterator types for linalg generic op
// for kIter (d0,d1,d2,d3) -> (d3)
// for output (d0,d1,d2,d3) -> (d0,d1,d2)
// for kSizeTensor (d0,d1,d2,d3) -> (d2)
SmallVector<AffineExpr> kIterExprs, outputExprs, kSizeTensorExprs;
for (unsigned i = 0; i < 3; i++) {
outputExprs.push_back(rewriter.getAffineDimExpr(i));
}
kSizeTensorExprs.push_back(rewriter.getAffineDimExpr(2));
kIterExprs.push_back(rewriter.getAffineDimExpr(3));
SmallVector<AffineMap> indexingMaps = AffineMap::inferFromExprList(
{kIterExprs, outputExprs, kSizeTensorExprs});
SmallVector<utils::IteratorType> iteratorTypes(
3, utils::IteratorType::parallel);
iteratorTypes.push_back(utils::IteratorType::reduction);

Value indexOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
auto sumPool = rewriter.create<linalg::GenericOp>(
loc, /*resultTensorTypes=*/
TypeRange({initOutput.getType(), kSizeTensor.getType()}),
/*inputs=*/ValueRange({kIter}),
/*outputs=*/ValueRange({initOutput, kSizeTensor}),
/*indexingMaps=*/indexingMaps,
/*iteratorTypes=*/iteratorTypes,
[&](OpBuilder &b, Location loc, ValueRange args) {
Value res = args[1];
Value ind0 = b.create<linalg::IndexOp>(loc, 0);
Value ind1 = b.create<linalg::IndexOp>(loc, 1);
Value ind2 = b.create<linalg::IndexOp>(loc, 2);
Value ind3 = b.create<linalg::IndexOp>(loc, 3);
// compute start and end indices
// st = s1( s0(ind2 * Hin) // Hout )
Value s0 = b.create<arith::MulIOp>(loc, ind2, hIn);
Value s1 = b.create<arith::FloorDivSIOp>(loc, s0, hOutIndex);
// en = e4( 1 + e3( e2( e1( e0(ind2 + 1) * hIn ) - 1 ) // hOut ) )
Value e0 = b.create<arith::AddIOp>(loc, ind2, indexOne);
Value e1 = b.create<arith::MulIOp>(loc, e0, hIn);
Value e2 = b.create<arith::SubIOp>(loc, e1, indexOne);
Value e3 = b.create<arith::FloorDivSIOp>(loc, e2, hOutIndex);
Value e4 = b.create<arith::AddIOp>(loc, indexOne, e3);
// get input element @ st + ind3:
Value wIndex = b.create<arith::AddIOp>(loc, s1, ind3);
Value inElt = b.create<tensor::ExtractOp>(
loc, elementType, buffInput, ValueRange({ind0, ind1, wIndex}));
// check if we extracted at windex < end index
Value cond =
b.create<arith::CmpIOp>(loc, arith::CmpIPredicate(6), wIndex, e4);
// if inElt is in bounds, include it in the computation
// else, use buffVal = 0 (for max pool use -infinity)
Value out1 = b.create<arith::SelectOp>(loc, cond, inElt, buffVal);
// compute Kernel size: we store this to kwTensor
Value kSize = b.create<arith::SubIOp>(loc, e4, s1);
Value kSizeInt = castIndexToInt64(b, loc, kSize);
Value kSizeF = b.create<arith::SIToFPOp>(loc, elementType, kSizeInt);
// accumulate out2 to res = args[1]
Value out2 = b.create<arith::AddFOp>(loc, res, out1);
b.create<linalg::YieldOp>(loc, ValueRange({out2, kSizeF}));
});

// make a linalg generic to divide each element by the corresponding
// Kernel Width. This step is only necessary for avg pooling.
SmallVector<AffineMap> indexingMaps1 =
AffineMap::inferFromExprList({kSizeTensorExprs, outputExprs});
SmallVector<utils::IteratorType> iteratorTypes1(
3, utils::IteratorType::parallel);
auto output = rewriter.create<linalg::GenericOp>(
loc, /*resultTensorTypes=*/initOutput.getType(),
/*inputs=*/sumPool.getResultTensors()[1],
/*outputs=*/sumPool.getResultTensors()[0],
/*indexingMaps=*/indexingMaps1,
/*iteratorTypes=*/iteratorTypes1,
[&](OpBuilder &b, Location loc, ValueRange args) {
Value q = b.create<arith::DivFOp>(loc, args[1], args[0]);
b.create<linalg::YieldOp>(loc, q);
});

rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputType,
output.getResultTensors());
return success();
}
};
} // namespace

void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
Expand All @@ -471,8 +654,12 @@ void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality(
target.addIllegalOp<AtenMaxPool2dWithIndicesOp>();
patterns.add<ConvertAtenMaxPool2dWithIndicesOp>(typeConverter, context);
target.addIllegalOp<AtenAvgPool1dOp, AtenAvgPool2dOp>();
patterns.add<ConvertAtenAvgPoolOp<AtenAvgPool1dOp, linalg::PoolingNcwSumOp, 1>>(
typeConverter, context);
patterns.add<ConvertAtenAvgPoolOp<AtenAvgPool2dOp, linalg::PoolingNchwSumOp, 2>>(
typeConverter, context);
patterns
.add<ConvertAtenAvgPoolOp<AtenAvgPool1dOp, linalg::PoolingNcwSumOp, 1>>(
typeConverter, context);
patterns
.add<ConvertAtenAvgPoolOp<AtenAvgPool2dOp, linalg::PoolingNchwSumOp, 2>>(
typeConverter, context);
target.addIllegalOp<AtenAdaptiveAvgPool1dOp>();
patterns.add<ConvertAtenAdaptiveAvgPool1dOp>(typeConverter, context);
}
3 changes: 3 additions & 0 deletions projects/pt1/e2e_testing/xfail_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@
# ERROR: Exception: Unsupported: missing default value for argument 0 in schema for aten.div.Tensor_mode
"ElementwiseDivRoundingModeFloorModule_basic",
"ElementwiseDivRoundingModeTruncModule_basic",
"AdaptiveAvgPool1dStaticLargerOutput_basic",
"AdaptiveAvgPool1dGeneralDynamic_basic",

# ERROR: Exception: Unsupported op: get_attr
"NumToTensorFloatModule_basic",
Expand Down Expand Up @@ -1324,6 +1326,7 @@
### Tests additionally passing in make_fx_tosa
"AdaptiveAvgPool1dNonUnitOutputSizeStaticModule_basic",
"AdaptiveAvgPool1dUnitOutputSizeStaticModule_basic",
"AdaptiveAvgPool1dStaticEvenMultiple_basic",
"NativeGroupNormBackwardModule_basic",
"SliceWholeTensorModule_basic",
"TensorFloatModule_basic",
Expand Down
2 changes: 1 addition & 1 deletion projects/pt1/python/torch_mlir/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def _get_for_tracing(
# compiler where each backend can "own" its set of legal ops.
BACKEND_LEGAL_OPS = {
OutputType.TOSA: ['aten.flatten.using_ints', 'aten.native_layer_norm', 'aten.linear'],
OutputType.LINALG_ON_TENSORS: ['aten.flatten.using_ints', ],
OutputType.LINALG_ON_TENSORS: ['aten.flatten.using_ints','aten.adaptive_avg_pool1d'],
OutputType.STABLEHLO: [],
}

Expand Down
Loading

0 comments on commit 07d0645

Please sign in to comment.