 
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
-
-#ifdef CAFFE2_USE_MKL
-#include <mkl.h>
-#endif // CAFFE2_USE_MKL
+#include "caffe2/perfkernels/batch_box_cox.h"
 
 namespace caffe2 {
 
-#ifdef CAFFE2_USE_MKL
 namespace {
-
-// Helpers for copying parameters.
 template <typename T>
-void TileArrayIntoVector(const T* a, int D, int K, vector<T>* b) {
-  b->resize(K * D);
-  for (int k = 0; k < K; k++) {
-    std::copy(a, a + D, b->begin() + k * D);
-  }
-}
-
-void TileIndicesInPlace(vector<int>* v, int D, int K) {
-  int n = v->size();
-  v->resize(K * n);
-  for (int k = 1; k < K; k++) {
-    for (int j = 0; j < n; j++) {
-      (*v)[k * n + j] = (*v)[j] + k * D;
+void BoxCoxNaive(
+    int64_t N,
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T* output_ptr) {
+  constexpr T k_eps = static_cast<T>(1e-6);
+  for (int64_t i = 0; i < N; i++) {
+    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
+      T lambda1_v = lambda1_ptr[j];
+      T lambda2_v = lambda2_ptr[j];
+      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
+      if (lambda1_v == 0) {
+        *output_ptr = std::log(tmp);
+      } else {
+        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
+      }
     }
   }
 }
-
-// MKL VML function templates.
-template <typename T>
-void PackV(const int N, const T* a, const int* ia, T* y);
-template <typename T>
-void UnpackV(const int N, const T* a, T* y, const int* iy);
-template <typename T>
-void Pow(const int N, const T* a, const T* b, T* y);
-
-#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
-  template <>                                                   \
-  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
-    OriginalFunc(N, a, ia, y);                                  \
-  }
-DELEGATE_PACKV_FUNCTION(float, vsPackV)
-DELEGATE_PACKV_FUNCTION(double, vdPackV)
-#undef DELEGATE_PACKV_FUNCTION
-
-#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
-  template <>                                                     \
-  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
-    OriginalFunc(N, a, y, iy);                                    \
-  }
-DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
-DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
-#undef DELEGATE_UNPACKV_FUNCTION
-
-#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
-  template <>                                                      \
-  void Funcname<T>(const int N, const T* a, const T* b, T* y) {    \
-    OriginalFunc(N, a, b, y);                                      \
-  }
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
-#undef DELEGATE_SIMPLE_BINARY_FUNCTION
-
-} // namespace
-#endif // CAFFE2_USE_MKL
+}
 
 template <>
 template <typename T>
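The BoxCoxNaive helper added above applies the two-parameter Box-Cox transform elementwise: each value is shifted by the per-column lambda2, clamped at a small epsilon, then either log-transformed (when lambda1 is zero) or power-transformed as (x^lambda1 - 1) / lambda1. Below is a minimal standalone sketch of the same rule; the BoxCoxSketch name and the 2x3 sample batch are made up for illustration and are not part of the patch:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Same elementwise rule as the BoxCoxNaive helper in the diff:
    //   out = log(max(x + lambda2, eps))                          if lambda1 == 0
    //   out = (pow(max(x + lambda2, eps), lambda1) - 1) / lambda1 otherwise
    template <typename T>
    void BoxCoxSketch(std::int64_t N, std::int64_t D, const T* x,
                      const T* lambda1, const T* lambda2, T* out) {
      constexpr T k_eps = static_cast<T>(1e-6);
      for (std::int64_t i = 0; i < N; ++i) {
        for (std::int64_t j = 0; j < D; ++j, ++x, ++out) {
          T tmp = std::max(*x + lambda2[j], k_eps);
          *out = (lambda1[j] == 0) ? std::log(tmp)
                                   : (std::pow(tmp, lambda1[j]) - 1) / lambda1[j];
        }
      }
    }

    int main() {
      // Hypothetical 2x3 batch; the first column uses the log branch (lambda1 == 0).
      std::vector<float> data    = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      std::vector<float> lambda1 = {0.f, 0.5f, 2.f};
      std::vector<float> lambda2 = {0.1f, 0.1f, 0.1f};
      std::vector<float> out(data.size());
      BoxCoxSketch<float>(2, 3, data.data(), lambda1.data(), lambda2.data(), out.data());
      for (float v : out) std::printf("%f\n", v);
      return 0;
    }
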
@@ -93,227 +55,19 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
   const auto* lambda1_ptr = lambda1.template data<T>();
   const auto* lambda2_ptr = lambda2.template data<T>();
 
-  const T k_eps = static_cast<T>(1e-6);
-
 #ifdef CAFFE2_USE_MKL
   if (min_block_size_ < 1) {
-    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-  } else {
-    // Find zero-valued columns, since they get special treatment.
-    nonzeros_.clear();
-    zeros_.clear();
-    nonzeros_.reserve(D);
-    zeros_.reserve(D);
-    for (int64_t j = 0; j < D; j++) {
-      if (lambda1_ptr[j] == 0) {
-        zeros_.push_back(j);
-      } else {
-        nonzeros_.push_back(j);
-      }
-    }
-
-    // Process K rows at a time for effective vectorization with small rows.
-    const int K = std::min(N, (min_block_size_ + D - 1) / D);
-
-    // Avoid copying data if all lambda1 values are zero, or if all are nonzero.
-    // In each of the three cases here, when K > 1, first process batches of K
-    // rows by replicating the input parameters K times. Then finish row-by-row.
-    TypedCachedBuffers<T>& b = GetBuffers<T>();
-    if (nonzeros_.size() == D) {
-      int64_t i = 0;
-      if (K > 1) {
-        TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_);
-        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_);
-        TORCH_DCHECK_EQ(K * D, b.lambda1_.size());
-        TORCH_DCHECK_EQ(K * D, b.lambda2_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxNonzeroLambda(
-              K * D,
-              data_ptr,
-              b.lambda1_.data(),
-              b.lambda2_.data(),
-              k_eps,
-              output_ptr);
-        }
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxNonzeroLambda(
-            D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-      }
-    } else if (zeros_.size() == D) {
-      int64_t i = 0;
-      if (K > 1) {
-        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_);
-        TORCH_DCHECK_EQ(K * D, b.lambda2_z_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxZeroLambda(
-              K * D, data_ptr, b.lambda2_z_.data(), k_eps, output_ptr);
-        }
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxZeroLambda(D, data_ptr, lambda2_ptr, k_eps, output_ptr);
-      }
-    } else { // General case of mixed zero and non-zero lambda1 values.
-      int n = nonzeros_.size();
-      if (K > 1) {
-        TileIndicesInPlace(&nonzeros_, 0, K);
-        TileIndicesInPlace(&zeros_, 0, K);
-      }
-
-      // Gather parameter values into contiguous memory.
-      b.lambda1_.resize(nonzeros_.size());
-      b.lambda2_.resize(nonzeros_.size());
-      b.lambda2_z_.resize(zeros_.size());
-      PackV(nonzeros_.size(), lambda1_ptr, nonzeros_.data(), b.lambda1_.data());
-      PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data());
-      PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data());
-
-      int64_t i = 0;
-      b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size()));
-      if (K > 1) {
-        // Truncate to original size, and re-tile with offsets this time.
-        nonzeros_.resize(n);
-        zeros_.resize(D - n);
-        TileIndicesInPlace(&nonzeros_, D, K);
-        TileIndicesInPlace(&zeros_, D, K);
-        TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda1_.size());
-        TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda2_.size());
-        TORCH_DCHECK_EQ(zeros_.size(), b.lambda2_z_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxMixedLambda(
-              data_ptr,
-              nonzeros_,
-              zeros_,
-              b.lambda1_.data(),
-              b.lambda2_.data(),
-              b.lambda2_z_.data(),
-              k_eps,
-              b.accumulator_.data(),
-              output_ptr);
-        }
-        // Truncate to original size.
-        nonzeros_.resize(n);
-        zeros_.resize(D - n);
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxMixedLambda(
-            data_ptr,
-            nonzeros_,
-            zeros_,
-            b.lambda1_.data(),
-            b.lambda2_.data(),
-            b.lambda2_z_.data(),
-            k_eps,
-            b.accumulator_.data(),
-            output_ptr);
-      }
-    }
+    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+    return true;
   }
-#else // CAFFE2_USE_MKL
-  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-#endif // CAFFE2_USE_MKL
+  caffe2::compute_batch_box_cox(
+      N, D, min_block_size_, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+#else
+  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+#endif
   return true;
 }
 
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
-    int64_t N,
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda1_ptr,
-    const T* lambda2_ptr,
-    T k_eps,
-    T* output_ptr) {
-  for (int64_t i = 0; i < N; i++) {
-    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
-      T lambda1_v = lambda1_ptr[j];
-      T lambda2_v = lambda2_ptr[j];
-      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
-      if (lambda1_v == 0) {
-        *output_ptr = std::log(tmp);
-      } else {
-        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
-      }
-    }
-  }
-}
-
-#ifdef CAFFE2_USE_MKL
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda1,
-    const T* lambda2,
-    T k_eps,
-    T* out) {
-  caffe2::math::Add(D, data_ptr, lambda2, out, &context_);
-  for (int64_t j = 0; j < D; j++) {
-    out[j] = std::max(out[j], k_eps);
-  }
-  Pow(D, out, lambda1, out);
-  for (int64_t j = 0; j < D; j++) {
-    out[j] -= 1.0;
-  }
-  caffe2::math::Div(D, out, lambda1, out, &context_);
-}
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxZeroLambda(
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda2,
-    T k_eps,
-    T* output_ptr) {
-  caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_);
-  for (int64_t j = 0; j < D; j++) {
-    output_ptr[j] = std::max(output_ptr[j], k_eps);
-  }
-  caffe2::math::Log(D, output_ptr, output_ptr, &context_);
-}
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxMixedLambda(
-    const T* data_ptr,
-    const vector<int>& nonzeros,
-    const vector<int>& zeros,
-    const T* lambda1,
-    const T* lambda2,
-    const T* lambda2_z,
-    T k_eps,
-    T* buffer,
-    T* output_ptr) {
-  PackV(nonzeros.size(), data_ptr, nonzeros.data(), buffer);
-  BoxCoxNonzeroLambda(nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
-  UnpackV(nonzeros.size(), buffer, output_ptr, nonzeros.data());
-
-  PackV(zeros.size(), data_ptr, zeros.data(), buffer);
-  BoxCoxZeroLambda(zeros.size(), buffer, lambda2_z, k_eps, buffer);
-  UnpackV(zeros.size(), buffer, output_ptr, zeros.data());
-}
-
-// Helpers to access cached buffers.
-#define DEFINE_CACHED_BUFFERS(T, tag)                                         \
-  template <>                                                                 \
-  template <>                                                                 \
-  BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>&                           \
-  BatchBoxCoxOp<CPUContext>::GetBuffers<T>() {                                \
-    if (!buffers_ || buffers_->type_ != tag) {                                \
-      buffers_.reset(new BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>()); \
-      buffers_->type_ = tag;                                                  \
-    }                                                                         \
-    return *static_cast<TypedCachedBuffers<T>*>(buffers_.get());              \
-  }
-DEFINE_CACHED_BUFFERS(float, 1);
-DEFINE_CACHED_BUFFERS(double, 2);
-#undef DEFINE_CACHED_BUFFERS
-
-#endif // CAFFE2_USE_MKL
 
 namespace {
 
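With MKL enabled, the blocked and vectorized path is now delegated to the shared perfkernels routine from caffe2/perfkernels/batch_box_cox.h; only batches below min_block_size_ fall back to the naive loop. The following is a hypothetical declaration sketch inferred solely from the call site above; the parameter names and integer types are assumptions, not the verified header:

    #include <cstddef>

    // Hypothetical declaration, inferred only from the call
    //   caffe2::compute_batch_box_cox(
    //       N, D, min_block_size_, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
    // The real header may use different parameter names or integer types.
    namespace caffe2 {
    template <typename T>
    void compute_batch_box_cox(
        std::size_t N,              // rows in the batch
        std::size_t D,              // columns (one lambda1/lambda2 pair per column)
        std::size_t min_block_size, // forwarded from the operator's min_block_size_
        const T* data,              // N * D input values
        const T* lambda1,           // D power parameters; 0 selects the log branch
        const T* lambda2,           // D shift parameters
        T* output);                 // N * D transformed values
    } // namespace caffe2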