added tests for more complex pipelines

yuanchi2807 · yuanchi2807 · commit c401edadec9e · 2021-05-27T11:31:57.000-04:00
diff --git a/codeflare/pipelines/tests/README.md b/codeflare/pipelines/tests/README.md
@@ -1,4 +1,4 @@
-# Architecture decision record 
+# Architecture decision record
 
 Select a test framework for Codeflare pipeline.
 
@@ -8,10 +8,9 @@ Contents:
 * [Unit test coverage](#unit-test-coverage)
 
 ## Use pytest as test framework
-PyTest is a testing framework in Python, with simple and easy syntax targeting unit tests and simple functional tests. PyTest can run tests in parallel and automatically detects tests in the test folder. PyTest serves the current goal of testing Codeflare pipelines well. 
+pytest is a Python testing framework with a simple syntax, aimed at unit tests and simple functional tests. It automatically discovers tests in the test folder and can run them in parallel (via the pytest-xdist plugin). pytest serves the current goal of testing Codeflare pipelines well.
 
 ## Unit test coverage
-* And node in the pipeline graph
-* Or node in the pipeline graph
-
-
+* `And` (fan-in) nodes in a pipeline graph, and their variants
+* `Or` (fan-out) nodes in a pipeline graph, and their variants
+* Multibranch pipeline graphs that mix `And` and `Or` nodes
diff --git a/codeflare/pipelines/tests/test_and.py b/codeflare/pipelines/tests/test_and.py
@@ -2,7 +2,7 @@
 import ray
 import pandas as pd
 import numpy as np
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
 import codeflare.pipelines.Datamodel as dm
 import codeflare.pipelines.Runtime as rt
 from codeflare.pipelines.Datamodel import Xy
@@ -15,15 +15,108 @@ def __init__(self):
 
     def transform(self, xy_list):
         X_list = []
-        y_list = []
+        y_vec = None
 
         for xy in xy_list:
             X_list.append(xy.get_x())
+            y_vec = xy.get_y()
         X_concat = np.concatenate(X_list, axis=0)
 
-        return Xy(X_concat, None)
+        return Xy(X_concat, y_vec)
 
-def test_and():
+def test_two_tier_and():
+
+    ray.shutdown()
+    ray.init()
+
+    ## prepare the data
+    X = np.random.randint(0,100,size=(10000, 4))
+    y = np.random.randint(0,2,size=(10000, 1))
+
+    ## initialize codeflare pipeline by first creating the nodes
+    pipeline = dm.Pipeline()
+    node_a = dm.EstimatorNode('a', MinMaxScaler())
+    node_b = dm.EstimatorNode('b', StandardScaler())
+    node_c = dm.EstimatorNode('c', MaxAbsScaler())
+    node_d = dm.EstimatorNode('d', RobustScaler())
+
+    node_e = dm.AndNode('e', FeatureUnion())
+    node_f = dm.AndNode('f', FeatureUnion())
+    node_g = dm.AndNode('g', FeatureUnion())
+
+    ## codeflare nodes are then connected by edges
+    pipeline.add_edge(node_a, node_e)
+    pipeline.add_edge(node_b, node_e)
+    pipeline.add_edge(node_c, node_f)
+    pipeline.add_edge(node_d, node_f)
+    pipeline.add_edge(node_e, node_g)
+    pipeline.add_edge(node_f, node_g)
+
+    pipeline_input = dm.PipelineInput()
+    xy = dm.Xy(X,y)
+    pipeline_input.add_xy_arg(node_a, xy)
+    pipeline_input.add_xy_arg(node_b, xy)
+    pipeline_input.add_xy_arg(node_c, xy)
+    pipeline_input.add_xy_arg(node_d, xy)
+
+    ## execute the codeflare pipeline
+    pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
+
+    ## retrieve node g
+    node_g_output = pipeline_output.get_xyrefs(node_g)
+    Xout = ray.get(node_g_output[0].get_Xref())
+    yout = ray.get(node_g_output[0].get_yref())
+
+    assert Xout.shape == (40000, 4)
+    assert yout.shape == (10000, 1)
+
+    ray.shutdown()
+
+def test_four_input_and():
+
+    ray.shutdown()
+    ray.init()
+
+    ## prepare the data
+    X = np.random.randint(0,100,size=(10000, 4))
+    y = np.random.randint(0,2,size=(10000, 1))
+
+    ## initialize codeflare pipeline by first creating the nodes
+    pipeline = dm.Pipeline()
+    node_a = dm.EstimatorNode('a', MinMaxScaler())
+    node_b = dm.EstimatorNode('b', StandardScaler())
+    node_c = dm.EstimatorNode('c', MaxAbsScaler())
+    node_d = dm.EstimatorNode('d', RobustScaler())
+
+    node_e = dm.AndNode('e', FeatureUnion())
+
+    ## codeflare nodes are then connected by edges
+    pipeline.add_edge(node_a, node_e)
+    pipeline.add_edge(node_b, node_e)
+    pipeline.add_edge(node_c, node_e)
+    pipeline.add_edge(node_d, node_e)
+
+    pipeline_input = dm.PipelineInput()
+    xy = dm.Xy(X,y)
+    pipeline_input.add_xy_arg(node_a, xy)
+    pipeline_input.add_xy_arg(node_b, xy)
+    pipeline_input.add_xy_arg(node_c, xy)
+    pipeline_input.add_xy_arg(node_d, xy)
+
+    ## execute the codeflare pipeline
+    pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
+
+    ## retrieve node e
+    node_e_output = pipeline_output.get_xyrefs(node_e)
+    Xout = ray.get(node_e_output[0].get_Xref())
+    yout = ray.get(node_e_output[0].get_yref())
+
+    assert Xout.shape == (40000, 4)
+    assert yout.shape == (10000, 1)
+
+    ray.shutdown()
+
+def test_two_input_and():
 
     ray.shutdown()
     ray.init()
@@ -52,7 +145,11 @@ def test_and():
 
     ## retrieve node c
     node_c_output = pipeline_output.get_xyrefs(node_c)
-    assert node_c_output
+    Xout = ray.get(node_c_output[0].get_Xref())
+    yout = ray.get(node_c_output[0].get_yref())
+
+    assert Xout.shape == (20000, 4)
+    assert yout.shape == (10000, 1)
 
     ray.shutdown()
 
diff --git a/codeflare/pipelines/tests/test_multibranch.py b/codeflare/pipelines/tests/test_multibranch.py
@@ -7,6 +7,7 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
 import codeflare.pipelines.Datamodel as dm
 import codeflare.pipelines.Runtime as rt
 from codeflare.pipelines.Datamodel import Xy
@@ -19,15 +20,16 @@ def __init__(self):
 
     def transform(self, xy_list):
         X_list = []
-        y_list = []
+        y_vec = None
 
         for xy in xy_list:
             X_list.append(xy.get_x())
-        X_concat = np.concatenate(X_list, axis=0)
+            y_vec = xy.get_y()
+        X_concat = np.concatenate(X_list, axis=1)
 
-        return Xy(X_concat, None)
+        return Xy(X_concat, y_vec.values.ravel())
 
-def test_multibranch():
+def test_multibranch_1():
 
     ray.shutdown()
     ray.init()
@@ -48,19 +50,70 @@ def test_multibranch():
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
-    ## create two decision tree classifiers with different depth limit
-    c_a = DecisionTreeClassifier(max_depth=3)
-    c_b = DecisionTreeClassifier(max_depth=5)
+    ## initialize codeflare pipeline by first creating the nodes
+    pipeline = dm.Pipeline()
+
+    node_a = dm.EstimatorNode('preprocess', preprocessor)
+    node_b = dm.EstimatorNode('s_b', MinMaxScaler())
+    node_c = dm.AndNode('s_c', FeatureUnion())
+    node_d = dm.EstimatorNode('c_d', LogisticRegression())
+    node_e = dm.EstimatorNode('c_e', DecisionTreeClassifier(max_depth=3))
+
+    ## codeflare nodes are then connected by edges
+    pipeline.add_edge(node_a, node_b)
+    pipeline.add_edge(node_b, node_c)
+    pipeline.add_edge(node_c, node_d)
+    pipeline.add_edge(node_c, node_e)
+
+    pipeline_input = dm.PipelineInput()
+    xy = dm.Xy(X_train, y_train)
+    pipeline_input.add_xy_arg(node_a, xy)
+
+    ## execute the codeflare pipeline
+    pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
+
+    ## retrieve node e
+    node_e_output = pipeline_output.get_xyrefs(node_e)
+    Xout = ray.get(node_e_output[0].get_Xref())
+    yout = ray.get(node_e_output[0].get_yref())
+
+    assert Xout.shape[0] == 8000
+    assert yout.shape[0] == 8000
+
+    ray.shutdown()
+
+def test_multibranch_2():
+
+    ray.shutdown()
+    ray.init()
+
+    ## prepare the data
+    X = pd.DataFrame(np.random.randint(0,100,size=(10000, 4)), columns=list('ABCD'))
+    y = pd.DataFrame(np.random.randint(0,2,size=(10000, 1)), columns=['Label'])
+
+    numeric_features = X.select_dtypes(include=['int64']).columns
+    numeric_transformer = Pipeline(steps=[
+        ('scaler', StandardScaler())])
+
+    ## set up preprocessor as StandardScaler
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', numeric_transformer, numeric_features),
+            ])
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
     ## initialize codeflare pipeline by first creating the nodes
     pipeline = dm.Pipeline()
+
     node_a = dm.EstimatorNode('preprocess', preprocessor)
-    node_b = dm.EstimatorNode('c_a', c_a)
-    node_c = dm.EstimatorNode('c_b', c_b)
+    node_b = dm.EstimatorNode('c_a', DecisionTreeClassifier(max_depth=3))
+    node_c = dm.EstimatorNode('c_b', LogisticRegression())
 
-    node_d = dm.EstimatorNode('d', MinMaxScaler())
-    node_e = dm.EstimatorNode('e', StandardScaler())
-    node_f = dm.AndNode('f', FeatureUnion())
+    node_d = dm.EstimatorNode('s_d', MinMaxScaler())
+    node_e = dm.EstimatorNode('s_e', StandardScaler())
+    node_f = dm.AndNode('s_f', FeatureUnion())
+    node_g = dm.EstimatorNode('c_g', DecisionTreeClassifier(max_depth=5))
 
     ## codeflare nodes are then connected by edges
     pipeline.add_edge(node_a, node_b)
@@ -70,18 +123,30 @@ def test_multibranch():
     pipeline.add_edge(node_a, node_e)
     pipeline.add_edge(node_d, node_f)
     pipeline.add_edge(node_e, node_f)
+    pipeline.add_edge(node_f, node_g)
 
     pipeline_input = dm.PipelineInput()
     xy = dm.Xy(X_train, y_train)
     pipeline_input.add_xy_arg(node_a, xy)
 
     ## execute the codeflare pipeline
     pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
-    assert pipeline_output
 
     ## retrieve node b
     node_b_output = pipeline_output.get_xyrefs(node_b)
-    assert node_b_output
+    Xout = ray.get(node_b_output[0].get_Xref())
+    yout = ray.get(node_b_output[0].get_yref())
+
+    assert Xout.shape[0] == 8000
+    assert yout.shape[0] == 8000
+
+    ## retrieve node g
+    node_g_output = pipeline_output.get_xyrefs(node_g)
+    Xout = ray.get(node_g_output[0].get_Xref())
+    yout = ray.get(node_g_output[0].get_yref())
+
+    assert Xout.shape[0] == 8000
+    assert yout.shape[0] == 8000
 
     ray.shutdown()
 
diff --git a/codeflare/pipelines/tests/test_or.py b/codeflare/pipelines/tests/test_or.py
@@ -7,13 +7,15 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
 import codeflare.pipelines.Datamodel as dm
 import codeflare.pipelines.Runtime as rt
 from codeflare.pipelines.Datamodel import Xy
 from codeflare.pipelines.Datamodel import XYRef
 from codeflare.pipelines.Runtime import ExecutionType
 
-def test_or():
+def test_four_input_or():
 
     ray.shutdown()
     ray.init()
@@ -33,7 +35,65 @@ def test_or():
             ])
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
-    
+
+    ## create four classifiers: two decision trees with different depth limits, a logistic regression, and a random forest
+    c_a = DecisionTreeClassifier(max_depth=3)
+    c_b = DecisionTreeClassifier(max_depth=5)
+    c_c = LogisticRegression()
+    c_d = RandomForestClassifier(max_depth=5)
+
+    ## initialize codeflare pipeline by first creating the nodes
+    pipeline = dm.Pipeline()
+    node_a = dm.EstimatorNode('preprocess', preprocessor)
+    node_b = dm.EstimatorNode('c_a', c_a)
+    node_c = dm.EstimatorNode('c_b', c_b)
+    node_d = dm.EstimatorNode('c_c', c_c)
+    node_e = dm.EstimatorNode('c_d', c_d)
+
+    ## codeflare nodes are then connected by edges
+    pipeline.add_edge(node_a, node_b)
+    pipeline.add_edge(node_a, node_c)
+    pipeline.add_edge(node_a, node_d)
+    pipeline.add_edge(node_a, node_e)
+
+    pipeline_input = dm.PipelineInput()
+    xy = dm.Xy(X_train, y_train)
+    pipeline_input.add_xy_arg(node_a, xy)
+
+    pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
+
+    node_e_output = pipeline_output.get_xyrefs(node_e)
+
+    Xout = ray.get(node_e_output[0].get_Xref())
+    yout = ray.get(node_e_output[0].get_yref())
+
+    assert Xout.shape[0] == 8000
+    assert yout.shape[0] == 8000
+
+    ray.shutdown()
+
+
+def test_two_input_or():
+
+    ray.shutdown()
+    ray.init()
+
+    ## prepare the data
+    X = pd.DataFrame(np.random.randint(0,100,size=(10000, 4)), columns=list('ABCD'))
+    y = pd.DataFrame(np.random.randint(0,2,size=(10000, 1)), columns=['Label'])
+
+    numeric_features = X.select_dtypes(include=['int64']).columns
+    numeric_transformer = Pipeline(steps=[
+        ('scaler', StandardScaler())])
+
+    ## set up preprocessor as StandardScaler
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', numeric_transformer, numeric_features),
+            ])
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
     ## create two decision tree classifiers with different depth limit
     c_a = DecisionTreeClassifier(max_depth=3)
     c_b = DecisionTreeClassifier(max_depth=5)
@@ -57,7 +117,11 @@ def test_or():
     node_b_output = pipeline_output.get_xyrefs(node_b)
     node_c_output = pipeline_output.get_xyrefs(node_c)
 
-    assert node_b_output
+    Xout = ray.get(node_b_output[0].get_Xref())
+    yout = ray.get(node_b_output[0].get_yref())
+
+    assert Xout.shape[0] == 8000
+    assert yout.shape[0] == 8000
 
     ray.shutdown()