Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit a44f1d3

Browse files
Simplify basic C++ instructions in README.md
1 parent 795b595 commit a44f1d3

File tree

1 file changed

+23
-53
lines changed

1 file changed

+23
-53
lines changed

README.md

Lines changed: 23 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -41,29 +41,27 @@ def tensordot(float(N, C1, C2, H, W) I0,
4141
O(n, c1, c3, h, w) +=! I0(n, c1, r_c2, h, w) * I1(n, r_c2, c3, h, w)
4242
}
4343
)TC";
44-
tc::ATenCompilationUnit<tc::CudaBackend> atCompl;
45-
atCompl.define(tc);
4644

4745
// 2. Allocate tensors with random data.
4846
at::Tensor I0 = at::CUDA(at::kFloat).rand({32, 8, 16, 17, 25});
4947
at::Tensor I1 = at::CUDA(at::kFloat).rand({32, 16, 2, 17, 25});
5048

5149
// 3. Run autotuning with evolutionary search starting from a naive option.
52-
auto options = tc::CudaMappingOptions::makeNaiveMappingOptions();
53-
tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
54-
auto bestOption = geneticAutotuneATen.tune(
55-
"/tmp/save_results", "tensordot", {I0, I1}, options);
56-
57-
// 4. Compile and run the TC with the best option.
58-
// Outputs get allocated; could also be pre-allocated and passed.
59-
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
60-
std::vector<at::Tensor> outputs;
61-
auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
62-
std::cout
63-
<< "tensordot size I0: " << I0.sizes() << ", "
64-
<< "size I1: " << I1.sizes() << " ran in: "
65-
<< std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
66-
<< "us\n";
50+
auto naiveOptions = Backend::MappingOptionsType::makeNaiveMappingOptions();
51+
tc::aten::ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch>
52+
geneticAutotuneATen(tc);
53+
auto bestOption =
54+
geneticAutotuneATen.tune("tensordot", {I0, I1}, {naiveOptions});
55+
56+
// 4. Compile and run the TC with the best option after allocating output
57+
// tensors.
58+
auto pExecutor =
59+
tc::aten::compile<Backend>(tc, "tensordot", {I0, I1}, bestOption[0]);
60+
auto outputs = tc::aten::prepareOutputs(tc, "tensordot", {I0, I1});
61+
auto timings = tc::aten::profile(*pExecutor, {I0, I1}, outputs);
62+
std::cout << "tensordot size I0: " << I0.sizes() << ", "
63+
<< "size I1: " << I1.sizes()
64+
<< " ran in: " << timings.kernelRuntime.toMicroSeconds() << "us\n";
6765
}
6866
```
6967
@@ -76,15 +74,15 @@ for (auto sizes : std::vector<std::pair<at::IntList, at::IntList>>{
7674
{{4, 9, 7, 16, 14}, {4, 7, 3, 16, 14}},
7775
{{8, 5, 11, 10, 10}, {8, 11, 16, 10, 10}},
7876
}) {
79-
at::Tensor I0 = at::CUDA(at::kFloat).rand(sizes.first);
80-
at::Tensor I1 = at::CUDA(at::kFloat).rand(sizes.second);
81-
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
82-
std::vector<at::Tensor> outputs;
83-
auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
77+
at::Tensor I0 = makeATenTensor<Backend>(sizes.first);
78+
at::Tensor I1 = makeATenTensor<Backend>(sizes.second);
79+
auto pExecutor =
80+
tc::aten::compile<Backend>(tc, "tensordot", {I0, I1}, bestOption[0]);
81+
auto outputs = tc::aten::prepareOutputs(tc, "tensordot", {I0, I1});
82+
auto timings = tc::aten::profile(*pExecutor, {I0, I1}, outputs);
8483
std::cout << "tensordot size I0: " << I0.sizes() << ", "
85-
<< "size I1: " << I1.sizes() << " ran in: "
86-
<< std::chrono::duration_cast<std::chrono::microseconds>(duration)
87-
.count()
84+
<< "size I1: " << I1.sizes()
85+
<< " ran in: " << timings.kernelRuntime.toMicroSeconds()
8886
<< "us\n";
8987
}
9088
```
@@ -96,11 +94,9 @@ Putting it all together, one may see:
9694
[----------] Global test environment set-up.
9795
[----------] 1 test from TensorDot
9896
[ RUN ] TensorDot.SimpleAutotune
99-
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
10097
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 226/4238/7345
10198
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/233
10299
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/234
103-
Dumping cache to /tmp/save_results.cuda/options
104100
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 239us
105101
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 56us
106102
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 210us
@@ -112,32 +108,6 @@ tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 210u
112108
[ PASSED ] 1 test.
113109
```
114110
115-
Tuning results are then available and reusable in ```/tmp/save_results.cuda``` and ```/tmp/save_results.proto```.
116-
117-
Interestingly, note that running the same example again will start from the best saved results and improve upon them.
118-
Of course this has diminishing returns:
119-
```shell
120-
> build$ ./examples/example_simple
121-
[==========] Running 1 test from 1 test case.
122-
[----------] Global test environment set-up.
123-
[----------] 1 test from TensorDot
124-
[ RUN ] TensorDot.SimpleAutotune
125-
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
126-
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 256/258/270
127-
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 158/255/616
128-
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 157/252/720
129-
Dumping cache to /tmp/save_results.cuda/options
130-
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 172us
131-
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 44us
132-
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 88us
133-
[ OK ] TensorDot.SimpleAutotune (28232 ms)
134-
[----------] 1 test from TensorDot (28232 ms total)
135-
136-
[----------] Global test environment tear-down
137-
[==========] 1 test from 1 test case ran. (28232 ms total)
138-
[ PASSED ] 1 test.
139-
```
140-
141111
We have not yet characterized the precise fraction of peak performance we obtain but it is not uncommon to obtain 80%+ of peak shared memory bandwidth after autotuning. Solid register-level optimizations are still in the work but TC in its current form already addresses the productivity gap between the needs of research and the needs of production. Which is why we are excited to share it with the entire community and bring this collaborative effort in the open.
142112
143113
# Documentation

0 commit comments

Comments
 (0)