@@ -128,162 +128,195 @@ vector<float> SimpleNet::TEST_Benchmark(
128
128
<< " . Iters per second: " << 1000.0 * main_runs / millis
129
129
<< std::endl;
130
130
}
131
- vector<float > time_per_op (operators_.size (), 0 );
132
- vector<uint64_t > flops_per_op;
133
- vector<uint64_t > memory_bytes_read_per_op;
134
- vector<uint64_t > memory_bytes_written_per_op;
135
- vector<uint64_t > param_bytes_per_op;
136
- CaffeMap<string, float > time_per_op_type;
137
- CaffeMap<string, float > flops_per_op_type;
138
- CaffeMap<string, float > memory_bytes_read_per_op_type;
139
- CaffeMap<string, float > memory_bytes_written_per_op_type;
140
- CaffeMap<string, float > param_bytes_per_op_type;
131
+
132
+ auto operators = GetOperators ();
133
+ auto results = IndividualMetrics (operators);
141
134
if (run_individual) {
142
135
for (int i = 0 ; i < main_runs; ++i) {
143
- for (auto & op : operators_) {
144
- op->ResetEvent ();
145
- }
146
- int idx = 0 ;
147
- for (auto & op : operators_) {
148
- const string& op_type = op->debug_def ().type ();
149
- if (i == 0 ) { // Gather flops on the first run.
150
- auto * schema = OpSchemaRegistry::Schema (op_type);
151
- if (schema && schema->HasCostInferenceFunction ()) {
152
- vector<TensorShape> shapes = op->InputTensorShapes ();
153
-
154
- auto all_good_shapes = std::accumulate (
155
- shapes.begin (),
156
- shapes.end (),
157
- true ,
158
- [](bool acc, const TensorShape& shape) {
159
- return acc && !shape.unknown_shape ();
160
- });
161
- OpSchema::Cost cost;
162
- if (all_good_shapes) {
163
- cost = schema->InferCost (op->debug_def (), shapes);
164
- }
136
+ results.RunOpsWithProfiling ();
137
+ }
138
+ results.PrintOperatorProfilingResults ();
139
+ }
140
+ // We will reuse time_per_op to return the result of BenchmarkNet.
141
+ std::vector<float > time_per_op (results.GetTimePerOp ());
142
+ for (size_t i = 0 ; i < time_per_op.size (); ++i) {
143
+ time_per_op[i] /= main_runs;
144
+ }
145
+ if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
146
+ time_per_op.insert (time_per_op.begin (), millis / main_runs);
147
+ }
148
+ return time_per_op;
149
+ }
165
150
166
- flops_per_op.emplace_back (cost.flops );
167
- memory_bytes_read_per_op.emplace_back (cost.bytes_read );
168
- memory_bytes_written_per_op.emplace_back (cost.bytes_written );
169
- param_bytes_per_op.emplace_back (cost.params_bytes );
151
+ void IndividualMetrics::RunOpsWithProfiling () {
152
+ int idx = 0 ;
153
+ Timer timer;
154
+ for (auto * op : operators_) {
155
+ const string& op_type = op->debug_def ().type ();
156
+ if (main_runs_ == 0 ) { // Gather flops on the first run.
157
+ auto * schema = OpSchemaRegistry::Schema (op_type);
158
+ if (schema && schema->HasCostInferenceFunction ()) {
159
+ vector<TensorShape> shapes = op->InputTensorShapes ();
170
160
171
- flops_per_op_type[op_type] += cost. flops ;
172
- memory_bytes_read_per_op_type[op_type] += cost. bytes_read ;
173
- memory_bytes_written_per_op_type[op_type] += cost. bytes_written ;
174
- param_bytes_per_op_type[op_type] += cost. params_bytes ;
175
- } else {
176
- flops_per_op. emplace_back ( 0 );
177
- memory_bytes_read_per_op. emplace_back ( 0 );
178
- memory_bytes_written_per_op. emplace_back ( 0 ) ;
179
- param_bytes_per_op. emplace_back ( 0 );
180
- }
161
+ auto all_good_shapes = std::accumulate (
162
+ shapes. begin (),
163
+ shapes. end (),
164
+ true ,
165
+ []( bool acc, const TensorShape& shape) {
166
+ return acc && !shape. unknown_shape ( );
167
+ } );
168
+ OpSchema::Cost cost ;
169
+ if (all_good_shapes) {
170
+ cost = schema-> InferCost (op-> debug_def (), shapes);
181
171
}
182
- timer.Start ();
183
- CAFFE_ENFORCE (
184
- op->Run (),
185
- " operator " ,
186
- op->debug_def ().name (),
187
- " (" ,
188
- op_type,
189
- " ) has failed." );
190
- float spent = timer.MilliSeconds ();
191
- time_per_op[idx] += spent;
192
- time_per_op_type[op_type] += spent;
193
- ++idx;
172
+
173
+ flops_per_op.emplace_back (cost.flops );
174
+ memory_bytes_read_per_op.emplace_back (cost.bytes_read );
175
+ memory_bytes_written_per_op.emplace_back (cost.bytes_written );
176
+ param_bytes_per_op.emplace_back (cost.params_bytes );
177
+
178
+ flops_per_op_type[op_type] += cost.flops ;
179
+ memory_bytes_read_per_op_type[op_type] += cost.bytes_read ;
180
+ memory_bytes_written_per_op_type[op_type] += cost.bytes_written ;
181
+ param_bytes_per_op_type[op_type] += cost.params_bytes ;
182
+ } else {
183
+ flops_per_op.emplace_back (0 );
184
+ memory_bytes_read_per_op.emplace_back (0 );
185
+ memory_bytes_written_per_op.emplace_back (0 );
186
+ param_bytes_per_op.emplace_back (0 );
194
187
}
195
188
}
196
- size_t idx = 0 ;
197
- for (auto & op : operators_) {
198
- const string& op_type = op->debug_def ().type ();
199
- const string& print_name =
200
- (op->debug_def ().name ().size ()
201
- ? op->debug_def ().name ()
202
- : (op->debug_def ().output_size () ? op->debug_def ().output (0 )
203
- : " NO_OUTPUT" ));
204
- std::stringstream flops_str;
205
- if (idx < flops_per_op.size () && flops_per_op[idx]) {
206
- flops_str << " (" << to_string (1.0e-9 * flops_per_op[idx]) << " GFLOP, "
207
- << to_string (
208
- 1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
209
- main_runs)
210
- << " GFLOPS)" ;
211
- }
212
- std::stringstream memory_bytes_read_str;
213
- if (idx < memory_bytes_read_per_op.size () &&
214
- memory_bytes_read_per_op[idx]) {
215
- memory_bytes_read_str
216
- << " (" << to_string (1.0e-6 * memory_bytes_read_per_op[idx])
217
- << " MB)" ;
218
- }
219
- std::stringstream memory_bytes_written_str;
220
- if (idx < memory_bytes_written_per_op.size () &&
221
- memory_bytes_written_per_op[idx]) {
222
- memory_bytes_written_str
223
- << " (" << to_string (1.0e-6 * memory_bytes_written_per_op[idx])
224
- << " MB)" ;
225
- }
226
- std::stringstream param_bytes_str;
227
- if (idx < param_bytes_per_op.size () && param_bytes_per_op[idx]) {
228
- param_bytes_str << " (" << to_string (1.0e-6 * param_bytes_per_op[idx])
229
- << " MB)" ;
230
- }
231
- std::cout << " Operator #" << idx << " (" << print_name << " , " << op_type
232
- << " ) " << time_per_op[idx] / main_runs << " ms/iter"
233
- << flops_str.str () << memory_bytes_written_str.str ()
234
- << param_bytes_str.str () << std::endl;
235
- ++idx;
189
+ timer.Start ();
190
+ CAFFE_ENFORCE (
191
+ op->Run (),
192
+ " operator " ,
193
+ op->debug_def ().name (),
194
+ " (" ,
195
+ op_type,
196
+ " ) has failed." );
197
+ float spent = timer.MilliSeconds ();
198
+ time_per_op[idx] += spent;
199
+ time_per_op_type[op_type] += spent;
200
+ ++idx;
201
+ }
202
+ ++main_runs_;
203
+ }
204
+
205
+ void IndividualMetrics::PrintOperatorProfilingResults () {
206
+ for (auto & op : operators_) {
207
+ op->ResetEvent ();
208
+ }
209
+ size_t idx = 0 ;
210
+ for (auto & op : operators_) {
211
+ const string& op_type = op->debug_def ().type ();
212
+ num_ops_per_op_type_[op_type]++;
213
+ const string& print_name =
214
+ (op->debug_def ().name ().size ()
215
+ ? op->debug_def ().name ()
216
+ : (op->debug_def ().output_size () ? op->debug_def ().output (0 )
217
+ : " NO_OUTPUT" ));
218
+ std::stringstream flops_str;
219
+ if (idx < flops_per_op.size () && flops_per_op[idx]) {
220
+ flops_str << " (" << to_string (1.0e-9 * flops_per_op[idx]) << " GFLOP, "
221
+ << to_string (
222
+ 1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
223
+ main_runs_)
224
+ << " GFLOPS)" ;
225
+ }
226
+ std::stringstream memory_bytes_read_str;
227
+ if (idx < memory_bytes_read_per_op.size () &&
228
+ memory_bytes_read_per_op[idx]) {
229
+ memory_bytes_read_str << " ("
230
+ << to_string (1.0e-6 * memory_bytes_read_per_op[idx])
231
+ << " MB)" ;
236
232
}
237
- const std::vector<string> metric ({" Time" ,
238
- " FLOP" ,
239
- " Feature Memory Read" ,
240
- " Feature Memory Written" ,
241
- " Parameter Memory" });
242
- const std::vector<double > normalizer (
243
- {1.0 / main_runs, 1.0e-9 , 1.0e-6 , 1.0e-6 , 1.0e-6 });
244
- const std::vector<string> unit ({" ms" , " GFLOP" , " MB" , " MB" , " MB" });
233
+ std::stringstream memory_bytes_written_str;
234
+ if (idx < memory_bytes_written_per_op.size () &&
235
+ memory_bytes_written_per_op[idx]) {
236
+ memory_bytes_written_str
237
+ << " (" << to_string (1.0e-6 * memory_bytes_written_per_op[idx])
238
+ << " MB)" ;
239
+ }
240
+ std::stringstream param_bytes_str;
241
+ if (idx < param_bytes_per_op.size () && param_bytes_per_op[idx]) {
242
+ param_bytes_str << " (" << to_string (1.0e-6 * param_bytes_per_op[idx])
243
+ << " MB)" ;
244
+ }
245
+ std::cout << " Operator #" << idx << " (" << print_name << " , " << op_type
246
+ << " ) " << time_per_op[idx] / main_runs_ << " ms/iter"
247
+ << flops_str.str () << memory_bytes_written_str.str ()
248
+ << param_bytes_str.str () << std::endl;
249
+ ++idx;
250
+ }
251
+ const std::vector<string> metric (
252
+ {" Time" ,
253
+ " FLOP" ,
254
+ " Feature Memory Read" ,
255
+ " Feature Memory Written" ,
256
+ " Parameter Memory" });
257
+ const std::vector<double > normalizer (
258
+ {1.0 / main_runs_, 1.0e-9 , 1.0e-6 , 1.0e-6 , 1.0e-6 });
259
+ const std::vector<string> unit ({" ms" , " GFLOP" , " MB" , " MB" , " MB" });
245
260
246
- std::vector<CaffeMap<string, float >*> metric_per_op_type_vec_vec;
247
- metric_per_op_type_vec_vec.emplace_back (&time_per_op_type);
248
- metric_per_op_type_vec_vec.emplace_back (&flops_per_op_type);
249
- metric_per_op_type_vec_vec.emplace_back (&memory_bytes_read_per_op_type);
250
- metric_per_op_type_vec_vec.emplace_back (&memory_bytes_written_per_op_type);
251
- metric_per_op_type_vec_vec.emplace_back (¶m_bytes_per_op_type);
252
- for (size_t i = 0 ; i < metric_per_op_type_vec_vec.size (); ++i) {
261
+ std::vector<CaffeMap<string, float >*> metric_per_op_type_vec_vec;
262
+ metric_per_op_type_vec_vec.emplace_back (&time_per_op_type);
263
+ metric_per_op_type_vec_vec.emplace_back (&flops_per_op_type);
264
+ metric_per_op_type_vec_vec.emplace_back (&memory_bytes_read_per_op_type);
265
+ metric_per_op_type_vec_vec.emplace_back (&memory_bytes_written_per_op_type);
266
+ metric_per_op_type_vec_vec.emplace_back (¶m_bytes_per_op_type);
267
+ for (size_t i = 0 ; i < metric_per_op_type_vec_vec.size (); ++i) {
268
+ auto * item = metric_per_op_type_vec_vec[i];
269
+ std::vector<std::pair<string, float >> metric_per_op_type_vec (
270
+ (*item).begin (), (*item).end ());
271
+ std::sort (
272
+ metric_per_op_type_vec.begin (),
273
+ metric_per_op_type_vec.end (),
274
+ PairLargerThan<string, float >);
275
+ float total_metric = 0 .;
276
+ for (const auto & op_item : metric_per_op_type_vec) {
277
+ total_metric += op_item.second * normalizer[i];
278
+ }
279
+ if (total_metric > 0 .) {
253
280
std::cout << metric[i] << " per operator type:" << std::endl;
254
- auto * item = metric_per_op_type_vec_vec[i];
255
- std::vector<std::pair<string, float >> metric_per_op_type_vec (
256
- (*item).begin (), (*item).end ());
257
- std::sort (
258
- metric_per_op_type_vec.begin (),
259
- metric_per_op_type_vec.end (),
260
- PairLargerThan<string, float >);
261
- float total_metric = 0 .;
262
- for (const auto & op_item : metric_per_op_type_vec) {
263
- total_metric += op_item.second * normalizer[i];
264
- }
265
- for (const auto & op_item : metric_per_op_type_vec) {
266
- float percent = 0 .;
267
- if (total_metric > 0 .) {
268
- percent = (100.0 * op_item.second * normalizer[i] / total_metric);
269
- }
270
- std::cout << std::setw (15 ) << std::setfill (' ' )
271
- << op_item.second * normalizer[i] << " " << unit[i] << " . "
272
- << std::setw (10 ) << std::setfill (' ' ) << percent << " %. "
273
- << op_item.first << std::endl;
281
+ }
282
+ for (const auto & op_item : metric_per_op_type_vec) {
283
+ float percent = 0 .;
284
+ const string& op = op_item.first ;
285
+ float value = op_item.second ;
286
+ if (total_metric > 0 .) {
287
+ percent = (100.0 * value * normalizer[i] / total_metric);
274
288
}
289
+ std::cout << std::setw (15 ) << std::setfill (' ' ) << value * normalizer[i]
290
+ << " " << unit[i] << " . " << std::setw (10 ) << std::setfill (' ' )
291
+ << percent << " %. " << op << " (" << num_ops_per_op_type_[op]
292
+ << " ops)" << std::endl;
293
+ }
294
+ if (total_metric > 0 .) {
275
295
std::cout << std::setw (15 ) << std::setfill (' ' ) << total_metric << " "
276
296
<< unit[i] << " in Total" << std::endl;
277
297
}
298
+ if (i == 0 ) {
299
+ if (setup_time > 0 ) {
300
+ std::cout << " BlackBoxPredictor setup time: "
301
+ << setup_time * normalizer[i] << " " << unit[i] << " \n " ;
302
+ }
303
+ if (memory_alloc_time > 0 ) {
304
+ std::cout << " Memory allocation time: "
305
+ << memory_alloc_time * normalizer[i] << " " << unit[i]
306
+ << " \n " ;
307
+ }
308
+ if (memory_dealloc_time > 0 ) {
309
+ std::cout << " Memory deallocation time: "
310
+ << memory_dealloc_time * normalizer[i] << " " << unit[i]
311
+ << std::endl;
312
+ }
313
+ if (output_dealloc_time > 0 ) {
314
+ std::cout << " Output deallocation time: "
315
+ << output_dealloc_time * normalizer[i] << " " << unit[i]
316
+ << std::endl;
317
+ }
318
+ }
278
319
}
279
- // We will reuse time_per_op to return the result of BenchmarkNet.
280
- for (size_t i = 0 ; i < time_per_op.size (); ++i) {
281
- time_per_op[i] /= main_runs;
282
- }
283
- if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
284
- time_per_op.insert (time_per_op.begin (), millis / main_runs);
285
- }
286
- return time_per_op;
287
320
}
288
321
289
322
REGISTER_NET (simple, SimpleNet);
0 commit comments