1
- import pytest
2
-
3
1
import codeflare .pipelines .Datamodel as dm
4
2
import codeflare .pipelines .Runtime as rt
5
3
6
4
import numpy as np
7
- from sklearn .preprocessing import FunctionTransformer
8
5
from sklearn .preprocessing import MinMaxScaler
9
6
import os
7
+ import pandas as pd
8
+ from sklearn .pipeline import Pipeline
9
+ from sklearn .impute import SimpleImputer
10
+ from sklearn .preprocessing import StandardScaler , OneHotEncoder
11
+ from sklearn .ensemble import RandomForestClassifier , GradientBoostingClassifier
12
+
13
+ import ray
10
14
11
15
12
16
class FeatureUnion (dm .AndTransform ):
@@ -47,8 +51,7 @@ def test_save_load():
47
51
r_fh = open (fname , 'rb' )
48
52
saved_pipeline = dm .Pipeline .load (r_fh )
49
53
pre_edges = saved_pipeline .get_pre_edges (node_c )
50
- assert (len (pre_edges ) == 2 )
51
-
54
+ assert (len (pre_edges ) == 2 )
52
55
os .remove (fname )
53
56
54
57
@@ -58,4 +61,76 @@ def test_runtime_save_load():
58
61
captured accurately
59
62
:return:
60
63
"""
61
-
64
+ train = pd .read_csv ('../../../resources/data/train_ctrUa4K.csv' )
65
+ train = train .drop ('Loan_ID' , axis = 1 )
66
+
67
+ X = train .drop ('Loan_Status' , axis = 1 )
68
+ y = train ['Loan_Status' ]
69
+ from sklearn .model_selection import train_test_split
70
+ X_train , X_test , y_train , y_test = train_test_split (X , y , test_size = 0.2 )
71
+ imputer = SimpleImputer (strategy = 'median' )
72
+ scaler = StandardScaler ()
73
+
74
+ numeric_transformer = Pipeline (steps = [
75
+ ('imputer' , imputer ),
76
+ ('scaler' , scaler )])
77
+
78
+ cat_imputer = SimpleImputer (strategy = 'constant' , fill_value = 'missing' )
79
+ cat_onehot = OneHotEncoder (handle_unknown = 'ignore' )
80
+
81
+ categorical_transformer = Pipeline (steps = [
82
+ ('imputer' , cat_imputer ),
83
+ ('onehot' , cat_onehot )])
84
+ numeric_features = train .select_dtypes (include = ['int64' , 'float64' ]).columns
85
+ categorical_features = train .select_dtypes (include = ['object' ]).drop (['Loan_Status' ], axis = 1 ).columns
86
+ from sklearn .compose import ColumnTransformer
87
+ preprocessor = ColumnTransformer (
88
+ transformers = [
89
+ ('num' , numeric_transformer , numeric_features ),
90
+ ('cat' , categorical_transformer , categorical_features )])
91
+
92
+ classifiers = [
93
+ RandomForestClassifier (),
94
+ GradientBoostingClassifier ()
95
+ ]
96
+ pipeline = dm .Pipeline ()
97
+ node_pre = dm .EstimatorNode ('preprocess' , preprocessor )
98
+ node_rf = dm .EstimatorNode ('random_forest' , classifiers [0 ])
99
+ node_gb = dm .EstimatorNode ('gradient_boost' , classifiers [1 ])
100
+
101
+ pipeline .add_edge (node_pre , node_rf )
102
+ pipeline .add_edge (node_pre , node_gb )
103
+
104
+ import ray
105
+ ray .shutdown ()
106
+ ray .init ()
107
+ pipeline_input = dm .PipelineInput ()
108
+ xy = dm .Xy (X_train , y_train )
109
+ pipeline_input .add_xy_arg (node_pre , xy )
110
+
111
+ pipeline_output = rt .execute_pipeline (pipeline , rt .ExecutionType .FIT , pipeline_input )
112
+ node_rf_xyrefs = pipeline_output .get_xyrefs (node_rf )
113
+
114
+ # save the random forest pipeline, then load it and predict on test data
115
+ fname = 'random_forest.cfp'
116
+ w_fh = open (fname , 'wb' )
117
+ rt .save (pipeline_output , node_rf_xyrefs [0 ], w_fh )
118
+ w_fh .close ()
119
+
120
+ # load it
121
+ r_fh = open (fname , 'rb' )
122
+ saved_pipeline = dm .Pipeline .load (r_fh )
123
+ nodes = saved_pipeline .get_nodes ()
124
+ # this should not exist in the saved pipeline
125
+ assert (node_gb .get_node_name () not in nodes .keys ())
126
+
127
+ # should be predictable as well
128
+ predict_pipeline_input = dm .PipelineInput ()
129
+ predict_pipeline_input .add_xy_arg (node_pre , dm .Xy (X_test , y_test ))
130
+ try :
131
+ predict_pipeline_output = rt .execute_pipeline (saved_pipeline , rt .ExecutionType .PREDICT , predict_pipeline_input )
132
+ predict_pipeline_output .get_xyrefs (node_rf )
133
+ except Exception :
134
+ assert False
135
+
136
+ os .remove (fname )
0 commit comments