Skip to content

Commit 14c53e4

Browse files
chcost authored and GitHub Enterprise committed
Merge pull request #20 from codeflare/lineage_semantics
Resolved conflicts and merging lineage semantics into develop.
2 parents 0408629 + 4eea26d commit 14c53e4

File tree

5 files changed

+277
-233
lines changed

5 files changed

+277
-233
lines changed

codeflare/pipelines/Datamodel.py

Lines changed: 78 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,11 @@
1-
from sklearn.base import BaseEstimator
21
from abc import ABC, abstractmethod
2+
import uuid
3+
from enum import Enum
4+
35

6+
import sklearn.base as base
7+
from sklearn.base import TransformerMixin
8+
from sklearn.base import BaseEstimator
49

510
class Xy:
611
"""
@@ -35,9 +40,12 @@ class XYRef:
3540
computed), these holders are essential to the pipeline constructs.
3641
"""
3742

38-
def __init__(self, Xref, yref):
43+
def __init__(self, Xref, yref, prev_noderef=None, curr_noderef=None, prev_Xyrefs = None):
3944
self.__Xref__ = Xref
4045
self.__yref__ = yref
46+
self.__prevnoderef__ = prev_noderef
47+
self.__currnoderef__ = curr_noderef
48+
self.__prev_Xyrefs__ = prev_Xyrefs
4149

4250
def get_Xref(self):
4351
"""
@@ -51,6 +59,32 @@ def get_yref(self):
5159
"""
5260
return self.__yref__
5361

62+
def get_prevnoderef(self):
63+
return self.__prevnoderef__
64+
65+
def get_currnoderef(self):
66+
return self.__currnoderef__
67+
68+
def get_prev_xyrefs(self):
69+
return self.__prev_Xyrefs__
70+
71+
72+
class NodeInputType(Enum):
    """
    How a node consumes its inputs.

    EstimatorNode is constructed with OR and AndNode with AND; the pipeline
    runtime compares a node's input type against these members to choose
    between the OR and the AND execution path.
    """
    # No trailing comma after a value: `OR = 0,` would make the member's
    # value the tuple (0,) instead of the int 0, inconsistent with AND.
    OR = 0
    AND = 1
75+
76+
77+
class NodeFiringType(Enum):
    """
    Firing policy attached to a node.

    Both EstimatorNode and AndNode are constructed with ANY.
    NOTE(review): presumably ANY fires as soon as any input is available and
    ALL waits for every input -- confirm against the runtime.
    """
    # No trailing comma after a value: `ANY = 0,` would make the member's
    # value the tuple (0,) instead of the int 0, inconsistent with ALL.
    ANY = 0
    ALL = 1
80+
81+
82+
class NodeStateType(Enum):
    """
    Kind of state a node carries.

    EstimatorNode is constructed with IMMUTABLE and AndNode with STATELESS.
    NOTE(review): the intended semantics of MUTABLE_SEQUENTIAL and
    MUTABLE_AGGREGATE are not visible here -- confirm before relying on them.
    """
    # No trailing commas after the values: `STATELESS = 0,` would make the
    # member's value the tuple (0,) instead of the int 0, leaving the members
    # with a mix of tuple and int values.
    STATELESS = 0
    IMMUTABLE = 1
    MUTABLE_SEQUENTIAL = 2
    MUTABLE_AGGREGATE = 3
87+
5488

5589
class Node(ABC):
5690
"""
@@ -59,20 +93,40 @@ class Node(ABC):
5993
node name and the type of the node match.
6094
"""
6195

96+
def __init__(self, node_name, node_input_type: NodeInputType, node_firing_type: NodeFiringType, node_state_type: NodeStateType):
97+
self.__node_name__ = node_name
98+
self.__node_input_type__ = node_input_type
99+
self.__node_firing_type__ = node_firing_type
100+
self.__node_state_type__ = node_state_type
101+
self.__id__ = uuid.uuid4()
102+
62103
def __str__(self):
63104
return self.__node_name__
64105

106+
def get_id(self):
107+
return self.__id__
108+
109+
def get_node_input_type(self):
110+
return self.__node_input_type__
111+
112+
def get_node_firing_type(self):
113+
return self.__node_firing_type__
114+
115+
def get_node_state_type(self):
116+
return self.__node_state_type__
117+
65118
@abstractmethod
66-
def get_and_flag(self):
67-
raise NotImplementedError("Please implement this method")
119+
def clone(self):
120+
raise NotImplementedError("Please implement the clone method")
68121

69122
def __hash__(self):
70123
"""
71124
Hash code, defined as the hash code of the node's unique id
72125
73126
:return: Hash code
74127
"""
75-
return self.__node_name__.__hash__()
128+
129+
return self.__id__.__hash__()
76130

77131
def __eq__(self, other):
78132
"""
@@ -84,16 +138,16 @@ def __eq__(self, other):
84138
"""
85139
return (
86140
self.__class__ == other.__class__ and
141+
self.__id__ == other.__id__ and
87142
self.__node_name__ == other.__node_name__
88143
)
89144

90145

91-
class OrNode(Node):
146+
class EstimatorNode(Node):
92147
"""
93148
Estimator node (an OR-input node), which is the basic node that would be the equivalent of any SKlearn pipeline
94149
stage. This node is initialized with an estimator that needs to extend sklearn.BaseEstimator.
95150
"""
96-
__estimator__ = None
97151

98152
def __init__(self, node_name: str, estimator: BaseEstimator):
99153
"""
@@ -102,7 +156,8 @@ def __init__(self, node_name: str, estimator: BaseEstimator):
102156
:param node_name: Name of the node
103157
:param estimator: The base estimator
104158
"""
105-
self.__node_name__ = node_name
159+
160+
super().__init__(node_name, NodeInputType.OR, NodeFiringType.ANY, NodeStateType.IMMUTABLE)
106161
self.__estimator__ = estimator
107162

108163
def get_estimator(self) -> BaseEstimator:
@@ -113,37 +168,33 @@ def get_estimator(self) -> BaseEstimator:
113168
"""
114169
return self.__estimator__
115170

116-
def get_and_flag(self):
117-
"""
118-
A flag to check if node is AND or not. By definition, this is NOT
119-
an AND node.
120-
:return: False, always
121-
"""
122-
return False
171+
def clone(self):
172+
cloned_estimator = base.clone(self.__estimator__)
173+
return EstimatorNode(self.__node_name__, cloned_estimator)
123174

124175

125-
class AndFunc(ABC):
126-
"""
127-
Or nodes are init-ed from the
128-
"""
176+
class AndTransform(TransformerMixin, BaseEstimator):
    """
    Base class for the transform wrapped by an AndNode.

    Subclasses provide `transform`, which takes a list of Xy holders and
    returns a single Xy.
    """

    @abstractmethod
    def transform(self, xy_list: list) -> Xy:
        """
        Combine the given Xy holders into one Xy.

        :param xy_list: List of Xy holders to combine
        :return: The combined Xy
        """
        message = "Please implement this method"
        raise NotImplementedError(message)
129180

181+
182+
class GeneralTransform(TransformerMixin, BaseEstimator):
130183
@abstractmethod
131-
def eval(self, xy_list: list) -> Xy:
184+
def transform(self, xy: Xy) -> Xy:
132185
raise NotImplementedError("Please implement this method")
133186

134187

135188
class AndNode(Node):
136-
__andfunc__ = None
137-
138-
def __init__(self, node_name: str, and_func: AndFunc):
139-
self.__node_name__ = node_name
189+
def __init__(self, node_name: str, and_func: AndTransform):
190+
super().__init__(node_name, NodeInputType.AND, NodeFiringType.ANY, NodeStateType.STATELESS)
140191
self.__andfunc__ = and_func
141192

142-
def get_and_func(self) -> AndFunc:
193+
def get_and_func(self) -> AndTransform:
143194
return self.__andfunc__
144195

145-
def get_and_flag(self):
146-
return True
196+
def clone(self):
197+
return AndNode(self.__node_name__, self.__andfunc__)
147198

148199

149200
class Edge:

codeflare/pipelines/Runtime.py

Lines changed: 45 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
11
import ray
22

3-
from codeflare.pipelines.Datamodel import OrNode
3+
4+
from codeflare.pipelines.Datamodel import EstimatorNode
45
from codeflare.pipelines.Datamodel import AndNode
56
from codeflare.pipelines.Datamodel import Edge
67
from codeflare.pipelines.Datamodel import Pipeline
78
from codeflare.pipelines.Datamodel import XYRef
89
from codeflare.pipelines.Datamodel import Xy
10+
from codeflare.pipelines.Datamodel import NodeInputType
11+
from codeflare.pipelines.Datamodel import NodeStateType
12+
from codeflare.pipelines.Datamodel import NodeFiringType
913

1014
import sklearn.base as base
1115
from enum import Enum
@@ -18,47 +22,60 @@ class ExecutionType(Enum):
1822

1923

2024
@ray.remote
21-
def execute_or_node_inner(node: OrNode, train_mode: ExecutionType, Xy: XYRef):
25+
def execute_or_node_remote(node: EstimatorNode, train_mode: ExecutionType, xy_ref: XYRef):
2226
estimator = node.get_estimator()
2327
# Blocking operation -- not avoidable
24-
X = ray.get(Xy.get_Xref())
25-
y = ray.get(Xy.get_yref())
28+
X = ray.get(xy_ref.get_Xref())
29+
y = ray.get(xy_ref.get_yref())
2630

31+
# TODO: Can optimize the node pointers without replicating them
2732
if train_mode == ExecutionType.FIT:
33+
cloned_node = node.clone()
34+
prev_node_ptr = ray.put(node)
35+
2836
if base.is_classifier(estimator) or base.is_regressor(estimator):
2937
# Always clone before fit, else fit is invalid
30-
cloned_estimator = base.clone(estimator)
38+
cloned_estimator = cloned_node.get_estimator()
3139
cloned_estimator.fit(X, y)
40+
41+
curr_node_ptr = ray.put(cloned_node)
3242
# TODO: For now, make yref passthrough - this has to be fixed more comprehensively
3343
res_Xref = ray.put(cloned_estimator.predict(X))
34-
result = XYRef(res_Xref, Xy.get_yref())
44+
result = XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
3545
return result
3646
else:
37-
# No need to clone as it is a transform pass through on the fitted estimator
38-
res_Xref = ray.put(estimator.fit_transform(X, y))
39-
result = XYRef(res_Xref, Xy.get_yref())
47+
cloned_estimator = cloned_node.get_estimator()
48+
res_Xref = ray.put(cloned_estimator.fit_transform(X, y))
49+
curr_node_ptr = ray.put(cloned_node)
50+
result = XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
4051
return result
4152
elif train_mode == ExecutionType.SCORE:
53+
cloned_node = node.clone()
54+
prev_node_ptr = ray.put(node)
55+
4256
if base.is_classifier(estimator) or base.is_regressor(estimator):
43-
cloned_estimator = base.clone(estimator)
57+
cloned_estimator = cloned_node.get_estimator()
4458
cloned_estimator.fit(X, y)
59+
curr_node_ptr = ray.put(cloned_node)
4560
res_Xref = ray.put(cloned_estimator.score(X, y))
46-
result = XYRef(res_Xref, Xy.get_yref())
61+
result = XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
4762
return result
4863
else:
49-
# No need to clone as it is a transform pass through on the fitted estimator
50-
res_Xref = ray.put(estimator.fit_transform(X, y))
51-
result = XYRef(res_Xref, Xy.get_yref())
64+
cloned_estimator = cloned_node.get_estimator()
65+
res_Xref = ray.put(cloned_estimator.fit_transform(X, y))
66+
curr_node_ptr = ray.put(cloned_node)
67+
result = XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
68+
5269
return result
5370
elif train_mode == ExecutionType.PREDICT:
5471
# Predict mode does not clone as it is a simple predict or transform
5572
if base.is_classifier(estimator) or base.is_regressor(estimator):
5673
res_Xref = estimator.predict(X)
57-
result = XYRef(res_Xref, Xy.get_yref())
74+
result = XYRef(res_Xref, xy_ref.get_yref())
5875
return result
5976
else:
6077
res_Xref = estimator.transform(X)
61-
result = XYRef(res_Xref, Xy.get_yref())
78+
result = XYRef(res_Xref, xy_ref.get_yref())
6279
return result
6380

6481

@@ -68,7 +85,7 @@ def execute_or_node(node, pre_edges, edge_args, post_edges, mode: ExecutionType)
6885
exec_xyrefs = []
6986
for xy_ref_ptr in Xyref_ptrs:
7087
xy_ref = ray.get(xy_ref_ptr)
71-
inner_result = execute_or_node_inner.remote(node, mode, xy_ref)
88+
inner_result = execute_or_node_remote.remote(node, mode, xy_ref)
7289
exec_xyrefs.append(inner_result)
7390

7491
for post_edge in post_edges:
@@ -78,29 +95,33 @@ def execute_or_node(node, pre_edges, edge_args, post_edges, mode: ExecutionType)
7895

7996

8097
@ray.remote
81-
def and_node_eval(and_func, Xyref_list):
98+
def execute_and_node_remote(node: AndNode, Xyref_list):
8299
xy_list = []
100+
prev_node_ptr = ray.put(node)
83101
for Xyref in Xyref_list:
84102
X = ray.get(Xyref.get_Xref())
85103
y = ray.get(Xyref.get_yref())
86104
xy_list.append(Xy(X, y))
87105

88-
res_Xy = and_func.eval(xy_list)
106+
cloned_node = node.clone()
107+
curr_node_ptr = ray.put(cloned_node)
108+
109+
cloned_and_func = cloned_node.get_and_func()
110+
res_Xy = cloned_and_func.transform(xy_list)
89111
res_Xref = ray.put(res_Xy.get_x())
90112
res_yref = ray.put(res_Xy.get_y())
91-
return XYRef(res_Xref, res_yref)
113+
return XYRef(res_Xref, res_yref, prev_node_ptr, curr_node_ptr, Xyref_list)
92114

93115

94116
def execute_and_node_inner(node: AndNode, Xyref_ptrs):
95-
and_func = node.get_and_func()
96117
result = []
97118

98119
Xyref_list = []
99120
for Xyref_ptr in Xyref_ptrs:
100121
Xyref = ray.get(Xyref_ptr)
101122
Xyref_list.append(Xyref)
102123

103-
Xyref_ptr = and_node_eval.remote(and_func, Xyref_list)
124+
Xyref_ptr = execute_and_node_remote.remote(node, Xyref_list)
104125
result.append(Xyref_ptr)
105126
return result
106127

@@ -136,9 +157,9 @@ def execute_pipeline(pipeline: Pipeline, mode: ExecutionType, in_args: dict):
136157
for node in nodes:
137158
pre_edges = pipeline.get_pre_edges(node)
138159
post_edges = pipeline.get_post_edges(node)
139-
if not node.get_and_flag():
160+
if node.get_node_input_type() == NodeInputType.OR:
140161
execute_or_node(node, pre_edges, edge_args, post_edges, mode)
141-
elif node.get_and_flag():
162+
elif node.get_node_input_type() == NodeInputType.AND:
142163
execute_and_node(node, pre_edges, edge_args, post_edges)
143164

144165
out_args = {}

codeflare_pipelines.egg-info/SOURCES.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
README.md
12
setup.py
23
codeflare/__init__.py
34
codeflare/pipelines/Datamodel.py

0 commit comments

Comments
 (0)