import codeflare.pipelines.Datamodel as dm
import codeflare.pipelines.Runtime as rt

import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import ray

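# FeatureUnion is the AND transform wired into the pipeline graphs below: its
# transform() receives the list of Xy outputs from the upstream nodes and
# concatenates their X matrices into a single array (the y side is dropped,
# which is all these save/load tests need).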
class FeatureUnion(dm.AndTransform):
    def __init__(self):
        pass

    def transform(self, xy_list):
        X_list = []

        for xy in xy_list:
            X_list.append(xy.get_x())
        X_concat = np.concatenate(X_list, axis=0)

        return dm.Xy(X_concat, None)

def test_save_load():
    """
    A simple save/load test for a pipeline graph
    :return:
    """
    pipeline = dm.Pipeline()
    minmax_scaler = MinMaxScaler()

    node_a = dm.EstimatorNode('a', minmax_scaler)
    node_b = dm.EstimatorNode('b', minmax_scaler)
    node_c = dm.AndNode('c', FeatureUnion())

    pipeline.add_edge(node_a, node_c)
    pipeline.add_edge(node_b, node_c)

    fname = 'save_pipeline.cfp'
    fh = open(fname, 'wb')
    pipeline.save(fh)
    fh.close()

    r_fh = open(fname, 'rb')
    saved_pipeline = dm.Pipeline.load(r_fh)
    r_fh.close()

    # the reloaded graph should preserve both incoming edges of the AND node
    pre_edges = saved_pipeline.get_pre_edges(node_c)
    assert len(pre_edges) == 2
    os.remove(fname)

def test_runtime_save_load():
    """
    Tests selecting a pipeline, saving and loading it, and then running predict on the
    loaded pipeline to ensure the fitted state is captured accurately
    :return:
    """
    train = pd.read_csv('../../../resources/data/train_ctrUa4K.csv')
    train = train.drop('Loan_ID', axis=1)

    X = train.drop('Loan_Status', axis=1)
    y = train['Loan_Status']
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()

    numeric_transformer = Pipeline(steps=[
        ('imputer', imputer),
        ('scaler', scaler)])

    cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    cat_onehot = OneHotEncoder(handle_unknown='ignore')

    categorical_transformer = Pipeline(steps=[
        ('imputer', cat_imputer),
        ('onehot', cat_onehot)])
    numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = train.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns
    from sklearn.compose import ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    classifiers = [
        RandomForestClassifier(),
        GradientBoostingClassifier()
    ]
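
    # Build the CodeFlare pipeline as a small DAG: one preprocessing node fans out
    # into two independent classifier nodes, so both models are fit against the
    # same preprocessed data.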
    pipeline = dm.Pipeline()
    node_pre = dm.EstimatorNode('preprocess', preprocessor)
    node_rf = dm.EstimatorNode('random_forest', classifiers[0])
    node_gb = dm.EstimatorNode('gradient_boost', classifiers[1])

    pipeline.add_edge(node_pre, node_rf)
    pipeline.add_edge(node_pre, node_gb)

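    # Run the DAG on a local Ray instance in FIT mode: the pipeline input binds the
    # raw training data to the root preprocessing node, and each classifier branch
    # is fit from the preprocessed output.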
    ray.shutdown()
    ray.init()
    pipeline_input = dm.PipelineInput()
    xy = dm.Xy(X_train, y_train)
    pipeline_input.add_xy_arg(node_pre, xy)

    pipeline_output = rt.execute_pipeline(pipeline, rt.ExecutionType.FIT, pipeline_input)
    node_rf_xyrefs = pipeline_output.get_xyrefs(node_rf)

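    # node_rf_xyrefs holds the output references produced at the random forest node
    # during FIT; the first of them is handed to rt.save below so that only the
    # random-forest lineage (preprocess -> random_forest) is persisted.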
    # save the pipeline selected for random forest, then load it back and predict on test data
    fname = 'random_forest.cfp'
    w_fh = open(fname, 'wb')
    rt.save(pipeline_output, node_rf_xyrefs[0], w_fh)
    w_fh.close()

    # load it
    r_fh = open(fname, 'rb')
    saved_pipeline = dm.Pipeline.load(r_fh)
    r_fh.close()

    nodes = saved_pipeline.get_nodes()
    # the gradient boosting node was not selected, so it should not exist in the saved pipeline
    assert node_gb.get_node_name() not in nodes.keys()

    # the reloaded pipeline should be usable for prediction as well
    predict_pipeline_input = dm.PipelineInput()
    predict_pipeline_input.add_xy_arg(node_pre, dm.Xy(X_test, y_test))
    try:
        predict_pipeline_output = rt.execute_pipeline(saved_pipeline, rt.ExecutionType.PREDICT, predict_pipeline_input)
        predict_pipeline_output.get_xyrefs(node_rf)
    except Exception:
        assert False

    os.remove(fname)