Skip to content

Commit ac506f5

Browse files
bwasti authored and facebook-github-bot committed
Back out "[nomnigraph][executor] computeChains with nomnigraph" (pytorch#15451)
Summary: Pull Request resolved: pytorch#15451 Original commit changeset: ccd050bfead6 Reviewed By: ilia-cher Differential Revision: D13533161 fbshipit-source-id: 1d0dcd54c2e3875aab015f3e996693e67a449b87
1 parent acbd9c4 commit ac506f5

File tree

4 files changed

+10
-187
lines changed

4 files changed

+10
-187
lines changed

caffe2/core/net_async_base.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ AsyncNetBase::AsyncNetBase(
7474
if (FLAGS_caffe2_net_async_inference_mode) {
7575
execution_chains_ = dag_utils::computeGroups(operator_nodes_);
7676
} else {
77-
execution_chains_ = dag_utils::computeChains(*net_def, operator_nodes_);
77+
execution_chains_ = dag_utils::computeChains(operator_nodes_);
7878
}
7979
chains_.reserve(execution_chains_.size());
8080
for (const auto& kv : execution_chains_) {

caffe2/core/net_dag_utils.cc

Lines changed: 0 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,9 @@
88
#include "caffe2/core/operator.h"
99
#include "caffe2/core/static_tracepoint.h"
1010
#include "caffe2/core/timer.h"
11-
#include "caffe2/opt/converter.h"
1211
#include "caffe2/proto/caffe2_pb.h"
1312
#include "caffe2/utils/proto_utils.h"
1413

15-
#include "nomnigraph/Graph/Algorithms.h"
16-
1714
namespace caffe2 {
1815
namespace dag_utils {
1916

@@ -123,146 +120,6 @@ void updateOperatorNodes(
123120
}
124121
} // namespace
125122

126-
using namespace nom::repr;
127-
using DepGraph = nom::Graph<NNGraph::NodeRef>;
128-
129-
// \brief This function prunes edges in the dependency
130-
// graph to increase the chaining opportunity.
131-
// It does not eliminate parallelism opportunity.
132-
void optimizeDependencyGraph(DepGraph* deps) {
133-
auto edges = deps->getMutableEdges();
134-
for (const auto& edge : edges) {
135-
auto tail = edge->tail();
136-
auto head = edge->head();
137-
deps->deleteEdge(edge);
138-
std::unordered_set<DepGraph::NodeRef> seen;
139-
nom::algorithm::reachable<DepGraph>(tail, nullptr, &seen);
140-
// Removing that edge removes a dominator, which is invalid
141-
if (!seen.count(head)) {
142-
deps->createEdge(tail, head);
143-
}
144-
}
145-
}
146-
147-
ExecutionChains computeChains(
148-
const caffe2::NetDef& predict_net,
149-
std::vector<OperatorNode>& orig_nodes) {
150-
// These serve as the map into predict_net.op()
151-
std::vector<NNGraph::NodeRef> nom_ops;
152-
auto nn = convertToNNModule(predict_net, false, &nom_ops);
153-
CAFFE_ENFORCE_EQ(nom_ops.size(), predict_net.op().size());
154-
155-
// Create a map from NodeRef to index into predict_net.op()
156-
// Now we can use pure nomnigraph functions and map back later
157-
std::unordered_map<NNGraph::NodeRef, int> nom_op_to_pos;
158-
for (auto idx = 0; idx < nom_ops.size(); ++idx) {
159-
nom_op_to_pos[nom_ops[idx]] = idx;
160-
}
161-
162-
// The algorithm:
163-
// 1) create dependency graph of ops
164-
// 2) for all nodes thats have multiple in edges, remove all in edges
165-
// 3) for all nodes thats have multiple out edges, remove all out edges
166-
// 4) return the components as chains
167-
168-
// Caveats that can easily be handled
169-
// 1) Cannot have a chain that crosses device options
170-
// insert extra edge at each boundary
171-
// 2) All CPU async ops have to be the last op in a chain
172-
// insert extra out edge
173-
DepGraph deps;
174-
175-
// Map NodeRef to the node in the dependency graph
176-
std::unordered_map<NNGraph::NodeRef, DepGraph::NodeRef> dep_map;
177-
for (const auto& node : nn::filter<NeuralNetOperator>(nn)) {
178-
dep_map[node] = deps.createNode(node);
179-
}
180-
181-
// 1) Create dependency graph
182-
for (const auto& node : nn::filter<NeuralNetOperator>(nn)) {
183-
for (const auto& output : nn::getOutputs(node)) {
184-
for (const auto& consumer : nn::getConsumers(output)) {
185-
// Record single dependencies first
186-
if (!deps.hasEdge(dep_map[node], dep_map[consumer])) {
187-
deps.createEdge(dep_map[node], dep_map[consumer]);
188-
}
189-
}
190-
}
191-
}
192-
193-
optimizeDependencyGraph(&deps);
194-
195-
// Fixup device boundary and async op issues
196-
for (const auto& dep : deps.getMutableNodes()) {
197-
int op_idx = nom_op_to_pos[dep->data()];
198-
auto d1 = orig_nodes.at(op_idx).operator_->device_option();
199-
auto outEdges = dep->getOutEdges();
200-
for (const auto& outEdge : outEdges) {
201-
int op2_idx = nom_op_to_pos[outEdge->head()->data()];
202-
auto d2 = orig_nodes.at(op2_idx).operator_->device_option();
203-
if (!IsSameDevice(d1, d2)) {
204-
deps.createEdge(dep, outEdge->head());
205-
}
206-
}
207-
if (d1.device_type() == PROTO_CUDA) {
208-
continue;
209-
}
210-
if (orig_nodes.at(op_idx).operator_->HasAsyncPart()) {
211-
outEdges = dep->getOutEdges();
212-
for (const auto& outEdge : outEdges) {
213-
// Clone out edges
214-
deps.createEdge(outEdge->tail(), outEdge->head());
215-
}
216-
}
217-
}
218-
219-
// 2) Prune in edges if multiplicity > 1
220-
// 3) Prune out edges if multiplicity > 1
221-
for (const auto& dep : deps.getMutableNodes()) {
222-
auto inEdges = dep->getInEdges();
223-
if (inEdges.size() > 1) {
224-
for (const auto& inEdge : inEdges) {
225-
NOM_REQUIRE_OR_CONT(inEdge);
226-
deps.deleteEdge(inEdge);
227-
}
228-
}
229-
auto outEdges = dep->getOutEdges();
230-
if (outEdges.size() > 1) {
231-
for (const auto& outEdge : outEdges) {
232-
NOM_REQUIRE_OR_CONT(outEdge);
233-
deps.deleteEdge(outEdge);
234-
}
235-
}
236-
}
237-
238-
// 4) Return components as chains
239-
std::vector<DepGraph::NodeRef> chain_starts;
240-
for (const auto& dep : deps.getMutableNodes()) {
241-
if (dep->getInEdges().size() == 0) {
242-
chain_starts.emplace_back(dep);
243-
}
244-
}
245-
246-
ExecutionChains chains;
247-
for (const auto& dep : chain_starts) {
248-
DepGraph::NodeRef front = dep;
249-
std::vector<int> ops;
250-
do {
251-
ops.emplace_back(nom_op_to_pos[front->data()]);
252-
auto outEdges = front->getOutEdges();
253-
if (outEdges.size()) {
254-
front = outEdges.at(0)->head();
255-
} else {
256-
front = nullptr;
257-
}
258-
} while (front);
259-
chains[nom_op_to_pos[dep->data()]] = ops;
260-
}
261-
262-
updateOperatorNodes(orig_nodes, chains);
263-
return chains;
264-
}
265-
266123
ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes) {
267124
const std::vector<OpGraphNode> nodes = pruneOpNodeGraph(orig_nodes);
268125
vector<int> initial_frontier;

caffe2/core/net_dag_utils.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,6 @@ struct OpGraphNode {
4343

4444
using ExecutionChains = std::unordered_map<int, std::vector<int>>;
4545

46-
C10_EXPORT ExecutionChains computeChains(
47-
const caffe2::NetDef& predict_net,
48-
std::vector<OperatorNode>& orig_nodes);
4946
C10_EXPORT ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes);
5047

5148
// Instead of breaking down the DAG into chains, we partition it into clusters

caffe2/python/test/executor_test.py

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,14 @@
22
from __future__ import division
33
from __future__ import print_function
44

5-
from caffe2.python import core, workspace, model_helper
6-
import random
5+
from caffe2.python import core, workspace
76
from caffe2.python.test.executor_test_util import (
87
build_conv_model,
98
build_resnet50_dataparallel_model,
109
run_resnet50_epoch,
1110
ExecutorTestBase,
1211
executor_test_settings,
13-
executor_test_model_names,
14-
)
12+
executor_test_model_names)
1513

1614
from caffe2.python.test_util import TestCase
1715

@@ -26,12 +24,10 @@
2624

2725

2826
class ExecutorCPUConvNetTest(ExecutorTestBase):
29-
@given(
30-
executor=st.sampled_from(EXECUTORS),
31-
model_name=st.sampled_from(executor_test_model_names()),
32-
batch_size=st.sampled_from([1]),
33-
num_workers=st.sampled_from([8]),
34-
)
27+
@given(executor=st.sampled_from(EXECUTORS),
28+
model_name=st.sampled_from(executor_test_model_names()),
29+
batch_size=st.sampled_from([1]),
30+
num_workers=st.sampled_from([8]))
3531
@executor_test_settings
3632
def test_executor(self, executor, model_name, batch_size, num_workers):
3733
model = build_conv_model(model_name, batch_size)
@@ -54,7 +50,8 @@ def run_model():
5450
@unittest.skipIf(not workspace.has_gpu_support
5551
and not workspace.has_hip_support, "no gpu")
5652
class ExecutorGPUResNetTest(ExecutorTestBase):
57-
@given(executor=st.sampled_from(EXECUTORS), num_workers=st.sampled_from([8]))
53+
@given(executor=st.sampled_from(EXECUTORS),
54+
num_workers=st.sampled_from([8]))
5855
@executor_test_settings
5956
def test_executor(self, executor, num_workers):
6057
model = build_resnet50_dataparallel_model(
@@ -103,33 +100,5 @@ def create_failing_net(throw_exception):
103100
self.assertFalse(res)
104101

105102

106-
class ExecutorFuzzTest(ExecutorTestBase):
107-
def test_fuzzy_model(self):
108-
model = model_helper.ModelHelper(name="test")
109-
inits = []
110-
for i in range(100):
111-
init = model.param_init_net.ConstantFill(
112-
[], "ONE" + str(i), shape=[1], value=1.0
113-
)
114-
inits.append(init)
115-
adds = []
116-
for i in range(1000):
117-
add = model.net.Add(
118-
[random.choice(inits + adds), random.choice(inits + adds)],
119-
"ADD" + str(i),
120-
)
121-
adds.append(add)
122-
123-
def run_model():
124-
workspace.RunNet(model.net, 100)
125-
126-
self.compare_executors(
127-
model,
128-
ref_executor="simple",
129-
test_executor="async_scheduling",
130-
model_run_func=run_model,
131-
)
132-
133-
134-
if __name__ == "__main__":
103+
if __name__ == '__main__':
135104
unittest.main()

0 commit comments

Comments (0)