Skip to content

Commit a296fa3

Browse files
Hao Lufacebook-github-bot
Hao Lu
authored andcommitted
[Caffe2] Implement BlackBoxPredictor::BenchmarkIndividualOps (pytorch#52903)
Summary: Pull Request resolved: pytorch#52903 Implement BlackBoxPredictor::BenchmarkIndividualOps so that we can clean up the output tensors properly after each iteration and get more accurate per operator timing. Add four more metrics to track setup_time, memory_alloc_time, memory_dealloc_time, and output_dealloc_time. Reviewed By: ajyu Differential Revision: D26657473 fbshipit-source-id: 1cf282192b531513b9ee40b37252087818412f81
1 parent 249c213 commit a296fa3

File tree

2 files changed

+214
-141
lines changed

2 files changed

+214
-141
lines changed

caffe2/core/net_simple.cc

Lines changed: 174 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -128,162 +128,195 @@ vector<float> SimpleNet::TEST_Benchmark(
128128
<< ". Iters per second: " << 1000.0 * main_runs / millis
129129
<< std::endl;
130130
}
131-
vector<float> time_per_op(operators_.size(), 0);
132-
vector<uint64_t> flops_per_op;
133-
vector<uint64_t> memory_bytes_read_per_op;
134-
vector<uint64_t> memory_bytes_written_per_op;
135-
vector<uint64_t> param_bytes_per_op;
136-
CaffeMap<string, float> time_per_op_type;
137-
CaffeMap<string, float> flops_per_op_type;
138-
CaffeMap<string, float> memory_bytes_read_per_op_type;
139-
CaffeMap<string, float> memory_bytes_written_per_op_type;
140-
CaffeMap<string, float> param_bytes_per_op_type;
131+
132+
auto operators = GetOperators();
133+
auto results = IndividualMetrics(operators);
141134
if (run_individual) {
142135
for (int i = 0; i < main_runs; ++i) {
143-
for (auto& op : operators_) {
144-
op->ResetEvent();
145-
}
146-
int idx = 0;
147-
for (auto& op : operators_) {
148-
const string& op_type = op->debug_def().type();
149-
if (i == 0) { // Gather flops on the first run.
150-
auto* schema = OpSchemaRegistry::Schema(op_type);
151-
if (schema && schema->HasCostInferenceFunction()) {
152-
vector<TensorShape> shapes = op->InputTensorShapes();
153-
154-
auto all_good_shapes = std::accumulate(
155-
shapes.begin(),
156-
shapes.end(),
157-
true,
158-
[](bool acc, const TensorShape& shape) {
159-
return acc && !shape.unknown_shape();
160-
});
161-
OpSchema::Cost cost;
162-
if (all_good_shapes) {
163-
cost = schema->InferCost(op->debug_def(), shapes);
164-
}
136+
results.RunOpsWithProfiling();
137+
}
138+
results.PrintOperatorProfilingResults();
139+
}
140+
// We will reuse time_per_op to return the result of BenchmarkNet.
141+
std::vector<float> time_per_op(results.GetTimePerOp());
142+
for (size_t i = 0; i < time_per_op.size(); ++i) {
143+
time_per_op[i] /= main_runs;
144+
}
145+
if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
146+
time_per_op.insert(time_per_op.begin(), millis / main_runs);
147+
}
148+
return time_per_op;
149+
}
165150

166-
flops_per_op.emplace_back(cost.flops);
167-
memory_bytes_read_per_op.emplace_back(cost.bytes_read);
168-
memory_bytes_written_per_op.emplace_back(cost.bytes_written);
169-
param_bytes_per_op.emplace_back(cost.params_bytes);
151+
void IndividualMetrics::RunOpsWithProfiling() {
152+
int idx = 0;
153+
Timer timer;
154+
for (auto* op : operators_) {
155+
const string& op_type = op->debug_def().type();
156+
if (main_runs_ == 0) { // Gather flops on the first run.
157+
auto* schema = OpSchemaRegistry::Schema(op_type);
158+
if (schema && schema->HasCostInferenceFunction()) {
159+
vector<TensorShape> shapes = op->InputTensorShapes();
170160

171-
flops_per_op_type[op_type] += cost.flops;
172-
memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
173-
memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
174-
param_bytes_per_op_type[op_type] += cost.params_bytes;
175-
} else {
176-
flops_per_op.emplace_back(0);
177-
memory_bytes_read_per_op.emplace_back(0);
178-
memory_bytes_written_per_op.emplace_back(0);
179-
param_bytes_per_op.emplace_back(0);
180-
}
161+
auto all_good_shapes = std::accumulate(
162+
shapes.begin(),
163+
shapes.end(),
164+
true,
165+
[](bool acc, const TensorShape& shape) {
166+
return acc && !shape.unknown_shape();
167+
});
168+
OpSchema::Cost cost;
169+
if (all_good_shapes) {
170+
cost = schema->InferCost(op->debug_def(), shapes);
181171
}
182-
timer.Start();
183-
CAFFE_ENFORCE(
184-
op->Run(),
185-
"operator ",
186-
op->debug_def().name(),
187-
"(",
188-
op_type,
189-
") has failed.");
190-
float spent = timer.MilliSeconds();
191-
time_per_op[idx] += spent;
192-
time_per_op_type[op_type] += spent;
193-
++idx;
172+
173+
flops_per_op.emplace_back(cost.flops);
174+
memory_bytes_read_per_op.emplace_back(cost.bytes_read);
175+
memory_bytes_written_per_op.emplace_back(cost.bytes_written);
176+
param_bytes_per_op.emplace_back(cost.params_bytes);
177+
178+
flops_per_op_type[op_type] += cost.flops;
179+
memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
180+
memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
181+
param_bytes_per_op_type[op_type] += cost.params_bytes;
182+
} else {
183+
flops_per_op.emplace_back(0);
184+
memory_bytes_read_per_op.emplace_back(0);
185+
memory_bytes_written_per_op.emplace_back(0);
186+
param_bytes_per_op.emplace_back(0);
194187
}
195188
}
196-
size_t idx = 0;
197-
for (auto& op : operators_) {
198-
const string& op_type = op->debug_def().type();
199-
const string& print_name =
200-
(op->debug_def().name().size()
201-
? op->debug_def().name()
202-
: (op->debug_def().output_size() ? op->debug_def().output(0)
203-
: "NO_OUTPUT"));
204-
std::stringstream flops_str;
205-
if (idx < flops_per_op.size() && flops_per_op[idx]) {
206-
flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, "
207-
<< to_string(
208-
1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
209-
main_runs)
210-
<< " GFLOPS)";
211-
}
212-
std::stringstream memory_bytes_read_str;
213-
if (idx < memory_bytes_read_per_op.size() &&
214-
memory_bytes_read_per_op[idx]) {
215-
memory_bytes_read_str
216-
<< " (" << to_string(1.0e-6 * memory_bytes_read_per_op[idx])
217-
<< " MB)";
218-
}
219-
std::stringstream memory_bytes_written_str;
220-
if (idx < memory_bytes_written_per_op.size() &&
221-
memory_bytes_written_per_op[idx]) {
222-
memory_bytes_written_str
223-
<< " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
224-
<< " MB)";
225-
}
226-
std::stringstream param_bytes_str;
227-
if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
228-
param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx])
229-
<< " MB)";
230-
}
231-
std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type
232-
<< ") " << time_per_op[idx] / main_runs << " ms/iter"
233-
<< flops_str.str() << memory_bytes_written_str.str()
234-
<< param_bytes_str.str() << std::endl;
235-
++idx;
189+
timer.Start();
190+
CAFFE_ENFORCE(
191+
op->Run(),
192+
"operator ",
193+
op->debug_def().name(),
194+
"(",
195+
op_type,
196+
") has failed.");
197+
float spent = timer.MilliSeconds();
198+
time_per_op[idx] += spent;
199+
time_per_op_type[op_type] += spent;
200+
++idx;
201+
}
202+
++main_runs_;
203+
}
204+
205+
void IndividualMetrics::PrintOperatorProfilingResults() {
206+
for (auto& op : operators_) {
207+
op->ResetEvent();
208+
}
209+
size_t idx = 0;
210+
for (auto& op : operators_) {
211+
const string& op_type = op->debug_def().type();
212+
num_ops_per_op_type_[op_type]++;
213+
const string& print_name =
214+
(op->debug_def().name().size()
215+
? op->debug_def().name()
216+
: (op->debug_def().output_size() ? op->debug_def().output(0)
217+
: "NO_OUTPUT"));
218+
std::stringstream flops_str;
219+
if (idx < flops_per_op.size() && flops_per_op[idx]) {
220+
flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, "
221+
<< to_string(
222+
1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
223+
main_runs_)
224+
<< " GFLOPS)";
225+
}
226+
std::stringstream memory_bytes_read_str;
227+
if (idx < memory_bytes_read_per_op.size() &&
228+
memory_bytes_read_per_op[idx]) {
229+
memory_bytes_read_str << " ("
230+
<< to_string(1.0e-6 * memory_bytes_read_per_op[idx])
231+
<< " MB)";
236232
}
237-
const std::vector<string> metric({"Time",
238-
"FLOP",
239-
"Feature Memory Read",
240-
"Feature Memory Written",
241-
"Parameter Memory"});
242-
const std::vector<double> normalizer(
243-
{1.0 / main_runs, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
244-
const std::vector<string> unit({"ms", "GFLOP", "MB", "MB", "MB"});
233+
std::stringstream memory_bytes_written_str;
234+
if (idx < memory_bytes_written_per_op.size() &&
235+
memory_bytes_written_per_op[idx]) {
236+
memory_bytes_written_str
237+
<< " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
238+
<< " MB)";
239+
}
240+
std::stringstream param_bytes_str;
241+
if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
242+
param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx])
243+
<< " MB)";
244+
}
245+
std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type
246+
<< ") " << time_per_op[idx] / main_runs_ << " ms/iter"
247+
<< flops_str.str() << memory_bytes_written_str.str()
248+
<< param_bytes_str.str() << std::endl;
249+
++idx;
250+
}
251+
const std::vector<string> metric(
252+
{"Time",
253+
"FLOP",
254+
"Feature Memory Read",
255+
"Feature Memory Written",
256+
"Parameter Memory"});
257+
const std::vector<double> normalizer(
258+
{1.0 / main_runs_, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
259+
const std::vector<string> unit({"ms", "GFLOP", "MB", "MB", "MB"});
245260

246-
std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
247-
metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
248-
metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
249-
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
250-
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
251-
metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
252-
for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
261+
std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
262+
metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
263+
metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
264+
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
265+
metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
266+
metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
267+
for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
268+
auto* item = metric_per_op_type_vec_vec[i];
269+
std::vector<std::pair<string, float>> metric_per_op_type_vec(
270+
(*item).begin(), (*item).end());
271+
std::sort(
272+
metric_per_op_type_vec.begin(),
273+
metric_per_op_type_vec.end(),
274+
PairLargerThan<string, float>);
275+
float total_metric = 0.;
276+
for (const auto& op_item : metric_per_op_type_vec) {
277+
total_metric += op_item.second * normalizer[i];
278+
}
279+
if (total_metric > 0.) {
253280
std::cout << metric[i] << " per operator type:" << std::endl;
254-
auto* item = metric_per_op_type_vec_vec[i];
255-
std::vector<std::pair<string, float>> metric_per_op_type_vec(
256-
(*item).begin(), (*item).end());
257-
std::sort(
258-
metric_per_op_type_vec.begin(),
259-
metric_per_op_type_vec.end(),
260-
PairLargerThan<string, float>);
261-
float total_metric = 0.;
262-
for (const auto& op_item : metric_per_op_type_vec) {
263-
total_metric += op_item.second * normalizer[i];
264-
}
265-
for (const auto& op_item : metric_per_op_type_vec) {
266-
float percent = 0.;
267-
if (total_metric > 0.) {
268-
percent = (100.0 * op_item.second * normalizer[i] / total_metric);
269-
}
270-
std::cout << std::setw(15) << std::setfill(' ')
271-
<< op_item.second * normalizer[i] << " " << unit[i] << ". "
272-
<< std::setw(10) << std::setfill(' ') << percent << "%. "
273-
<< op_item.first << std::endl;
281+
}
282+
for (const auto& op_item : metric_per_op_type_vec) {
283+
float percent = 0.;
284+
const string& op = op_item.first;
285+
float value = op_item.second;
286+
if (total_metric > 0.) {
287+
percent = (100.0 * value * normalizer[i] / total_metric);
274288
}
289+
std::cout << std::setw(15) << std::setfill(' ') << value * normalizer[i]
290+
<< " " << unit[i] << ". " << std::setw(10) << std::setfill(' ')
291+
<< percent << "%. " << op << " (" << num_ops_per_op_type_[op]
292+
<< " ops)" << std::endl;
293+
}
294+
if (total_metric > 0.) {
275295
std::cout << std::setw(15) << std::setfill(' ') << total_metric << " "
276296
<< unit[i] << " in Total" << std::endl;
277297
}
298+
if (i == 0) {
299+
if (setup_time > 0) {
300+
std::cout << "BlackBoxPredictor setup time: "
301+
<< setup_time * normalizer[i] << " " << unit[i] << "\n";
302+
}
303+
if (memory_alloc_time > 0) {
304+
std::cout << "Memory allocation time: "
305+
<< memory_alloc_time * normalizer[i] << " " << unit[i]
306+
<< "\n";
307+
}
308+
if (memory_dealloc_time > 0) {
309+
std::cout << "Memory deallocation time: "
310+
<< memory_dealloc_time * normalizer[i] << " " << unit[i]
311+
<< std::endl;
312+
}
313+
if (output_dealloc_time > 0) {
314+
std::cout << "Output deallocation time: "
315+
<< output_dealloc_time * normalizer[i] << " " << unit[i]
316+
<< std::endl;
317+
}
318+
}
278319
}
279-
// We will reuse time_per_op to return the result of BenchmarkNet.
280-
for (size_t i = 0; i < time_per_op.size(); ++i) {
281-
time_per_op[i] /= main_runs;
282-
}
283-
if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
284-
time_per_op.insert(time_per_op.begin(), millis / main_runs);
285-
}
286-
return time_per_op;
287320
}
288321

289322
REGISTER_NET(simple, SimpleNet);

caffe2/core/net_simple.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,46 @@
1313

1414
namespace caffe2 {
1515

16+
struct IndividualMetrics {
17+
public:
18+
explicit IndividualMetrics(const std::vector<OperatorBase*>& operators)
19+
: main_runs_(0), operators_(operators) {
20+
const auto num_ops = operators_.size();
21+
time_per_op.resize(num_ops, 0.0);
22+
}
23+
// run ops while collecting profiling results
24+
void RunOpsWithProfiling();
25+
26+
// print out profiling results
27+
void PrintOperatorProfilingResults();
28+
29+
const vector<float>& GetTimePerOp() {
30+
return time_per_op;
31+
}
32+
33+
float setup_time{0.0};
34+
float memory_alloc_time{0.0};
35+
float memory_dealloc_time{0.0};
36+
float output_dealloc_time{0.0};
37+
38+
private:
39+
int main_runs_;
40+
const std::vector<OperatorBase*>& operators_;
41+
42+
vector<float> time_per_op;
43+
vector<uint64_t> flops_per_op;
44+
vector<uint64_t> memory_bytes_read_per_op;
45+
vector<uint64_t> memory_bytes_written_per_op;
46+
vector<uint64_t> param_bytes_per_op;
47+
48+
CaffeMap<string, int> num_ops_per_op_type_;
49+
CaffeMap<string, float> time_per_op_type;
50+
CaffeMap<string, float> flops_per_op_type;
51+
CaffeMap<string, float> memory_bytes_read_per_op_type;
52+
CaffeMap<string, float> memory_bytes_written_per_op_type;
53+
CaffeMap<string, float> param_bytes_per_op_type;
54+
};
55+
1656
// This is the very basic structure you need to run a network - all it
1757
// does is simply to run everything in sequence. If you want more fancy control
1858
// such as a DAG-like execution, check out other better net implementations.

0 commit comments

Comments
 (0)