Commit 2e4c89e

efiks authored and pytorchmergebot committed

[torch] Unify batch_box_cox implementations into perfkernels folder (pytorch#86569)

Summary:
1) Add an MKL/AVX2-based implementation of batch Box-Cox to perfkernels. It is similar to the one in caffe2/operators/batch_box_cox_op.cc.
2) Migrate caffe2's batch_box_cox_op to use this implementation.

Test Plan: CI

Differential Revision: D40208074

Pull Request resolved: pytorch#86569
Approved by: https://github.com/hyuen

1 parent 0d2baed commit 2e4c89e
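
For reference, both the naive path and the vectorized kernel compute the same two-parameter Box-Cox transform. In the notation of the BoxCoxNaive code in the diff below, with per-column parameters $\lambda_{1,j}$, $\lambda_{2,j}$ and $\varepsilon = 10^{-6}$:

$$
y_{ij} =
\begin{cases}
\ln\!\big(\max(x_{ij} + \lambda_{2,j},\, \varepsilon)\big), & \lambda_{1,j} = 0,\\[4pt]
\dfrac{\max(x_{ij} + \lambda_{2,j},\, \varepsilon)^{\lambda_{1,j}} - 1}{\lambda_{1,j}}, & \lambda_{1,j} \neq 0.
\end{cases}
$$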

File tree

6 files changed: +478 −332 lines changed

caffe2/operators/batch_box_cox_op.cc (+27 −273)

@@ -2,72 +2,34 @@
 
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
-
-#ifdef CAFFE2_USE_MKL
-#include <mkl.h>
-#endif // CAFFE2_USE_MKL
+#include "caffe2/perfkernels/batch_box_cox.h"
 
 namespace caffe2 {
 
-#ifdef CAFFE2_USE_MKL
 namespace {
-
-// Helpers for copying parameters.
 template <typename T>
-void TileArrayIntoVector(const T* a, int D, int K, vector<T>* b) {
-  b->resize(K * D);
-  for (int k = 0; k < K; k++) {
-    std::copy(a, a + D, b->begin() + k * D);
-  }
-}
-
-void TileIndicesInPlace(vector<int>* v, int D, int K) {
-  int n = v->size();
-  v->resize(K * n);
-  for (int k = 1; k < K; k++) {
-    for (int j = 0; j < n; j++) {
-      (*v)[k * n + j] = (*v)[j] + k * D;
+void BoxCoxNaive(
+    int64_t N,
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T* output_ptr) {
+  constexpr T k_eps = static_cast<T>(1e-6);
+  for (int64_t i = 0; i < N; i++) {
+    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
+      T lambda1_v = lambda1_ptr[j];
+      T lambda2_v = lambda2_ptr[j];
+      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
+      if (lambda1_v == 0) {
+        *output_ptr = std::log(tmp);
+      } else {
+        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
+      }
     }
   }
 }
-
-// MKL VML function templates.
-template <typename T>
-void PackV(const int N, const T* a, const int* ia, T* y);
-template <typename T>
-void UnpackV(const int N, const T* a, T* y, const int* iy);
-template <typename T>
-void Pow(const int N, const T* a, const T* b, T* y);
-
-#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
-  template <>                                                   \
-  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
-    OriginalFunc(N, a, ia, y);                                  \
-  }
-DELEGATE_PACKV_FUNCTION(float, vsPackV)
-DELEGATE_PACKV_FUNCTION(double, vdPackV)
-#undef DELEGATE_PACKV_FUNCTION
-
-#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
-  template <>                                                     \
-  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
-    OriginalFunc(N, a, y, iy);                                    \
-  }
-DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
-DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
-#undef DELEGATE_UNPACKV_FUNCTION
-
-#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
-  template <>                                                      \
-  void Funcname<T>(const int N, const T* a, const T* b, T* y) {    \
-    OriginalFunc(N, a, b, y);                                      \
-  }
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
-#undef DELEGATE_SIMPLE_BINARY_FUNCTION
-
-} // namespace
-#endif // CAFFE2_USE_MKL
+}
 
 template <>
 template <typename T>
@@ -93,227 +55,19 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
   const auto* lambda1_ptr = lambda1.template data<T>();
   const auto* lambda2_ptr = lambda2.template data<T>();
 
-  const T k_eps = static_cast<T>(1e-6);
-
 #ifdef CAFFE2_USE_MKL
   if (min_block_size_ < 1) {
-    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-  } else {
-    // Find zero-valued columns, since they get special treatment.
-    nonzeros_.clear();
-    zeros_.clear();
-    nonzeros_.reserve(D);
-    zeros_.reserve(D);
-    for (int64_t j = 0; j < D; j++) {
-      if (lambda1_ptr[j] == 0) {
-        zeros_.push_back(j);
-      } else {
-        nonzeros_.push_back(j);
-      }
-    }
-
-    // Process K rows at a time for effective vectorization with small rows.
-    const int K = std::min(N, (min_block_size_ + D - 1) / D);
-
-    // Avoid copying data if all lambda1 values are zero, or if all are nonzero.
-    // In each of the three cases here, when K > 1, first process batches of K
-    // rows by replicating the input parameters K times. Then finish row-by-row.
-    TypedCachedBuffers<T>& b = GetBuffers<T>();
-    if (nonzeros_.size() == D) {
-      int64_t i = 0;
-      if (K > 1) {
-        TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_);
-        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_);
-        TORCH_DCHECK_EQ(K * D, b.lambda1_.size());
-        TORCH_DCHECK_EQ(K * D, b.lambda2_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxNonzeroLambda(
-              K * D,
-              data_ptr,
-              b.lambda1_.data(),
-              b.lambda2_.data(),
-              k_eps,
-              output_ptr);
-        }
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxNonzeroLambda(
-            D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-      }
-    } else if (zeros_.size() == D) {
-      int64_t i = 0;
-      if (K > 1) {
-        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_);
-        TORCH_DCHECK_EQ(K * D, b.lambda2_z_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxZeroLambda(
-              K * D, data_ptr, b.lambda2_z_.data(), k_eps, output_ptr);
-        }
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxZeroLambda(D, data_ptr, lambda2_ptr, k_eps, output_ptr);
-      }
-    } else { // General case of mixed zero and non-zero lambda1 values.
-      int n = nonzeros_.size();
-      if (K > 1) {
-        TileIndicesInPlace(&nonzeros_, 0, K);
-        TileIndicesInPlace(&zeros_, 0, K);
-      }
-
-      // Gather parameter values into contiguous memory.
-      b.lambda1_.resize(nonzeros_.size());
-      b.lambda2_.resize(nonzeros_.size());
-      b.lambda2_z_.resize(zeros_.size());
-      PackV(nonzeros_.size(), lambda1_ptr, nonzeros_.data(), b.lambda1_.data());
-      PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data());
-      PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data());
-
-      int64_t i = 0;
-      b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size()));
-      if (K > 1) {
-        // Truncate to original size, and re-tile with offsets this time.
-        nonzeros_.resize(n);
-        zeros_.resize(D - n);
-        TileIndicesInPlace(&nonzeros_, D, K);
-        TileIndicesInPlace(&zeros_, D, K);
-        TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda1_.size());
-        TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda2_.size());
-        TORCH_DCHECK_EQ(zeros_.size(), b.lambda2_z_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxMixedLambda(
-              data_ptr,
-              nonzeros_,
-              zeros_,
-              b.lambda1_.data(),
-              b.lambda2_.data(),
-              b.lambda2_z_.data(),
-              k_eps,
-              b.accumulator_.data(),
-              output_ptr);
-        }
-        // Truncate to original size.
-        nonzeros_.resize(n);
-        zeros_.resize(D - n);
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxMixedLambda(
-            data_ptr,
-            nonzeros_,
-            zeros_,
-            b.lambda1_.data(),
-            b.lambda2_.data(),
-            b.lambda2_z_.data(),
-            k_eps,
-            b.accumulator_.data(),
-            output_ptr);
-      }
-    }
+    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+    return true;
   }
-#else // CAFFE2_USE_MKL
-  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-#endif // CAFFE2_USE_MKL
+  caffe2::compute_batch_box_cox(
+      N, D, min_block_size_, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+#else
+  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+#endif
   return true;
 }
 
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
-    int64_t N,
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda1_ptr,
-    const T* lambda2_ptr,
-    T k_eps,
-    T* output_ptr) {
-  for (int64_t i = 0; i < N; i++) {
-    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
-      T lambda1_v = lambda1_ptr[j];
-      T lambda2_v = lambda2_ptr[j];
-      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
-      if (lambda1_v == 0) {
-        *output_ptr = std::log(tmp);
-      } else {
-        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
-      }
-    }
-  }
-}
-
-#ifdef CAFFE2_USE_MKL
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda1,
-    const T* lambda2,
-    T k_eps,
-    T* out) {
-  caffe2::math::Add(D, data_ptr, lambda2, out, &context_);
-  for (int64_t j = 0; j < D; j++) {
-    out[j] = std::max(out[j], k_eps);
-  }
-  Pow(D, out, lambda1, out);
-  for (int64_t j = 0; j < D; j++) {
-    out[j] -= 1.0;
-  }
-  caffe2::math::Div(D, out, lambda1, out, &context_);
-}
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxZeroLambda(
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda2,
-    T k_eps,
-    T* output_ptr) {
-  caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_);
-  for (int64_t j = 0; j < D; j++) {
-    output_ptr[j] = std::max(output_ptr[j], k_eps);
-  }
-  caffe2::math::Log(D, output_ptr, output_ptr, &context_);
-}
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxMixedLambda(
-    const T* data_ptr,
-    const vector<int>& nonzeros,
-    const vector<int>& zeros,
-    const T* lambda1,
-    const T* lambda2,
-    const T* lambda2_z,
-    T k_eps,
-    T* buffer,
-    T* output_ptr) {
-  PackV(nonzeros.size(), data_ptr, nonzeros.data(), buffer);
-  BoxCoxNonzeroLambda(nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
-  UnpackV(nonzeros.size(), buffer, output_ptr, nonzeros.data());
-
-  PackV(zeros.size(), data_ptr, zeros.data(), buffer);
-  BoxCoxZeroLambda(zeros.size(), buffer, lambda2_z, k_eps, buffer);
-  UnpackV(zeros.size(), buffer, output_ptr, zeros.data());
-}
-
-// Helpers to access cached buffers.
-#define DEFINE_CACHED_BUFFERS(T, tag)                                         \
-  template <>                                                                 \
-  template <>                                                                 \
-  BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>&                           \
-  BatchBoxCoxOp<CPUContext>::GetBuffers<T>() {                                \
-    if (!buffers_ || buffers_->type_ != tag) {                                \
-      buffers_.reset(new BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>()); \
-      buffers_->type_ = tag;                                                  \
-    }                                                                         \
-    return *static_cast<TypedCachedBuffers<T>*>(buffers_.get());              \
-  }
-DEFINE_CACHED_BUFFERS(float, 1);
-DEFINE_CACHED_BUFFERS(double, 2);
-#undef DEFINE_CACHED_BUFFERS
-
-#endif // CAFFE2_USE_MKL
 
 namespace {
 
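Since the naive kernel is self-contained, its behavior can be checked in isolation. Below is a minimal standalone sketch: the BoxCoxNaive body is copied from the diff above, while the main harness and sample values are illustrative additions, not part of the commit.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
void BoxCoxNaive(
    int64_t N, int64_t D,
    const T* data_ptr, const T* lambda1_ptr, const T* lambda2_ptr,
    T* output_ptr) {
  constexpr T k_eps = static_cast<T>(1e-6);
  for (int64_t i = 0; i < N; i++) {
    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
      T lambda1_v = lambda1_ptr[j];
      T lambda2_v = lambda2_ptr[j];
      // Shift by lambda2 and clamp away from zero before log/pow.
      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
      if (lambda1_v == 0) {
        *output_ptr = std::log(tmp);  // log branch for lambda1 == 0
      } else {
        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
      }
    }
  }
}

int main() {
  // N = 2 rows, D = 3 columns; column 0 exercises the log branch.
  const int64_t N = 2, D = 3;
  std::vector<float> data = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  std::vector<float> lambda1 = {0.f, 0.5f, 2.f};
  std::vector<float> lambda2 = {0.f, 0.f, 0.f};
  std::vector<float> out(N * D);
  BoxCoxNaive(N, D, data.data(), lambda1.data(), lambda2.data(), out.data());
  for (int64_t i = 0; i < N * D; i++) {
    std::printf("%g\n", out[i]);
  }
  return 0;
}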
caffe2/operators/batch_box_cox_op.h (+1 −59)

@@ -29,65 +29,7 @@ class BatchBoxCoxOp final : public Operator<Context> {
   bool DoRunWithType();
 
  protected:
-  template <typename T>
-  void BoxCoxNaive(
-      int64_t N,
-      int64_t D,
-      const T* data_ptr,
-      const T* lambda1_ptr,
-      const T* lambda2_ptr,
-      T k_eps,
-      T* output_ptr);
-
-#ifdef CAFFE2_USE_MKL
-  template <typename T>
-  void BoxCoxNonzeroLambda(
-      int64_t D,
-      const T* data_ptr,
-      const T* lambda1,
-      const T* lambda2,
-      T k_eps,
-      T* output_ptr);
-
-  template <typename T>
-  void BoxCoxZeroLambda(
-      int64_t D,
-      const T* data_ptr,
-      const T* lambda2,
-      T k_eps,
-      T* output_ptr);
-
-  template <typename T>
-  void BoxCoxMixedLambda(
-      const T* data_ptr,
-      const vector<int>& nonzeros,
-      const vector<int>& zeros,
-      const T* lambda1,
-      const T* lambda2,
-      const T* lambda2_z,
-      T k_eps,
-      T* buffer,
-      T* output_ptr);
-
-  vector<int> nonzeros_, zeros_;
-
-  // Buffers used by the MKL version are cached across calls.
-  struct CachedBuffers {
-    virtual ~CachedBuffers() {}
-    int type_;
-  };
-  template <typename T>
-  struct TypedCachedBuffers : public CachedBuffers {
-    vector<T> lambda1_, lambda2_, lambda2_z_;
-    vector<T> accumulator_;
-  };
-  template <typename T>
-  TypedCachedBuffers<T>& GetBuffers();
-  unique_ptr<CachedBuffers> buffers_;
-
-#endif // CAFFE2_USE_MKL
-
-  int min_block_size_;
+  std::size_t min_block_size_;
 
   INPUT_TAGS(DATA, LAMBDA1, LAMBDA2);
 };
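
The shared kernel itself lives in caffe2/perfkernels/batch_box_cox.h, one of the four changed files not shown in this view. A sketch of its interface, inferred from the call site in the .cc diff above (parameter types are assumptions, not the verbatim declaration):

// Sketch of the shared kernel's interface, inferred from the call
// compute_batch_box_cox(N, D, min_block_size_, data_ptr, lambda1_ptr,
// lambda2_ptr, output_ptr) in the .cc diff; types are assumptions.
#include <cstddef>

namespace caffe2 {

template <typename T>
void compute_batch_box_cox(
    std::size_t N,          // rows in the batch
    std::size_t D,          // columns (per-column lambda1/lambda2)
    std::size_t block_size, // the operator passes min_block_size_
    const T* data,
    const T* lambda1,
    const T* lambda2,
    T* output);

} // namespace caffe2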
