Refactor AdaBoost code

ndrplz · ndrplz · commit d332476e354a · 2019-12-03T03:02:02.000+01:00
* Refactor comments to be more compliant with PEP 8
* Remove inconsistent / legacy / trivial comments
* Make `_plot` a static method of AdaBoostClassifier
* Fix unused `title` parameter in plot_boundary
* Rewrite accuracy as one-liner in main
* Remove `cmap` global variable in `utils.py`
* Refactor variables to improve code readability
* Add docs and comments
* Make blank lines homogeneous
diff --git a/lab/boosting/code/boosting.py b/lab/boosting/code/boosting.py
@@ -1,40 +1,36 @@
+import warnings
+
 import matplotlib.pyplot as plt
 import numpy as np
-
-from utils import cmap
+from numpy.random import choice
 
 
 class WeakClassifier:
     """
-    Function that models a WeakClassifier
+    Class that models a WeakClassifier
     """
-
     def __init__(self):
-
-        # initialize a few stuff
         self._dim = None
         self._threshold = None
         self._label_above_split = None
 
     def fit(self, X: np.ndarray, Y: np.ndarray):
 
-        n, d = X.shape
-        possible_labels = np.unique(Y)
-
-        # select random feature (see np.random.choice)
-        self._dim = np.random.choice(a=range(0, d))
+        # Select random feature (see np.random.choice)
+        _, n_feats = X.shape
+        self._dim = choice(a=range(0, n_feats))
 
-        # select random split (see np.random.uniform)
-        M, m = np.max(X[:, self._dim]), np.min(X[:, self._dim])
-        self._threshold = np.random.uniform(low=m, high=M)
+        # Select random split threshold
+        feat_min = np.min(X[:, self._dim])
+        feat_max = np.max(X[:, self._dim])
+        self._threshold = np.random.uniform(low=feat_min, high=feat_max)
 
-        # select random verse (see np.random.choice)
-        self._label_above_split = np.random.choice(a=possible_labels)
+        # Select random verse
+        possible_labels = np.unique(Y)
+        self._label_above_split = choice(a=possible_labels)
 
     def predict(self, X: np.ndarray):
-
-        num_samples = X.shape[0]
-        y_pred = np.zeros(shape=num_samples)
+        y_pred = np.zeros(shape=X.shape[0])
         y_pred[X[:, self._dim] >= self._threshold] = self._label_above_split
         y_pred[X[:, self._dim] < self._threshold] = -1 * self._label_above_split
 
@@ -43,20 +39,17 @@ def predict(self, X: np.ndarray):
 
 class AdaBoostClassifier:
     """
-    Function that models a Adaboost classifier
+    Class encapsulating AdaBoost classifier
     """
-
     def __init__(self, n_learners: int, n_max_trials: int = 200):
         """
-        Model constructor
+        Initialize an AdaBoost classifier.
 
         Parameters
         ----------
         n_learners: int
-            number of weak classifiers.
+            Number of weak classifiers.
         """
-
-        # initialize a few stuff
         self.n_learners = n_learners
         self.learners = []
         self.alphas = np.zeros(shape=n_learners)
@@ -69,74 +62,68 @@ def fit(self, X: np.ndarray, Y: np.ndarray, verbose: bool = False):
 
         Parameters
         ----------
-        X: ndarray
-            features having shape (n_samples, dim).
-        Y: ndarray
-            class labels having shape (n_samples,).
+        X: np.ndarray
+            Features having shape (n_samples, dim).
+        Y: np.ndarray
+            Class labels having shape (n_samples,).
         verbose: bool
-            whether or not to visualize the learning process.
-            Default is False
+            Whether or not to visualize the learning process (default=False).
         """
 
-        # some inits
-        n, d = X.shape
-        if d != 2:
-            verbose = False  # only plot learning if 2 dimensional
+        n_examples, n_feats = X.shape
 
-        possible_labels = np.unique(Y)
+        distinct_labels = len(np.unique(Y))
+        if distinct_labels == 1:
+            warnings.warn('Fitting {} on a dataset with only one label.'.format(
+                self.__class__.__name__))
+        elif distinct_labels > 2:
+            raise NotImplementedError('Only binary classification is supported.')
 
-        # only binary problems please
-        assert possible_labels.size == 2, 'Error: data is not binary'
+        # Initialize all examples with equal weights
+        weights = np.ones(shape=n_examples) / n_examples
 
-        # initialize the sample weights as equally probable
-        sample_weights = np.ones(shape=n) / n
-
-        # start training
+        # Train ensemble
         for l in range(self.n_learners):
-
-            # choose the indexes of 'difficult' samples (np.random.choice)
-            cur_idx = np.random.choice(a=range(0, n), size=n, replace=True, p=sample_weights)
-
-            # extract 'difficult' samples
-            cur_X = X[cur_idx]
-            cur_Y = Y[cur_idx]
-
-              # search for a weak classifier
-            error = 1
+            # Perform a weighted re-sampling (with replacement) of the dataset
+            #  to create a new dataset on which the current weak learner will
+            #  be trained.
+            sampled_idxs = choice(a=range(0, n_examples), size=n_examples,
+                                  replace=True, p=weights)
+            cur_X = X[sampled_idxs]
+            cur_Y = Y[sampled_idxs]
+
+            # Search for a weak classifier
             n_trials = 0
-            cur_wclass = None
-            y_pred = None
-
+            error = 1.
             while error > 0.5:
+                weak_learner = WeakClassifier()
+                weak_learner.fit(cur_X, cur_Y)
+                y_pred = weak_learner.predict(cur_X)
 
-                cur_wclass = WeakClassifier()
-                cur_wclass.fit(cur_X, cur_Y)
-                y_pred = cur_wclass.predict(cur_X)
-
-                # compute error
-                error = np.sum(sample_weights[cur_idx[cur_Y != y_pred]])
+                # Compute current weak learner error
+                error = np.sum(weights[sampled_idxs[cur_Y != y_pred]])
 
+                # Re-initialize sample weights if number of trials is exceeded
                 n_trials += 1
                 if n_trials > self.n_max_trials:
-                    # initialize the sample weights again
-                    sample_weights = np.ones(shape=n) / n
+                    weights = np.ones(shape=n_examples) / n_examples
 
-            # save weak learner parameter
+            # Store weak learner parameter
             self.alphas[l] = alpha = np.log((1 - error) / error) / 2
 
-            # append the weak classifier to the chain
-            self.learners.append(cur_wclass)
+            # Append the weak classifier to the chain
+            self.learners.append(weak_learner)
 
-            # update sample weights
-            sample_weights[cur_idx[cur_Y != y_pred]] *= np.exp(alpha)
-            sample_weights[cur_idx[cur_Y == y_pred]] *= np.exp(-alpha)
-            sample_weights /= np.sum(sample_weights)
+            # Update examples weights
+            weights[sampled_idxs[cur_Y != y_pred]] *= np.exp(alpha)
+            weights[sampled_idxs[cur_Y == y_pred]] *= np.exp(-alpha)
+            weights /= np.sum(weights)  # re-normalize
 
-            if verbose:
-                self._plot(cur_X, y_pred, sample_weights[cur_idx],
+            # Possibly plot the predictions (if these are 2D)
+            if verbose and n_feats == 2:
+                self._plot(cur_X, y_pred, weights[sampled_idxs],
                            self.learners[-1], l)
 
-
     def predict(self, X: np.ndarray):
         """
         Function to perform predictions over a set of samples.
@@ -167,10 +154,10 @@ def predict(self, X: np.ndarray):
 
         return pred
 
-    def _plot(self, X: np.ndarray, y_pred: np.ndarray, weights: np.ndarray,
-              learner: WeakClassifier, iteration: int):
+    @staticmethod
+    def _plot(X: np.ndarray, y_pred: np.ndarray, weights: np.ndarray,
+              learner: WeakClassifier, iteration: int, cmap: str = 'jet'):
 
-        # plot
         plt.clf()
         plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=weights * 50000,
                     cmap=cmap, edgecolors='k')
diff --git a/lab/boosting/code/main.py b/lab/boosting/code/main.py
@@ -1,39 +1,39 @@
 import matplotlib.pyplot as plt
 import numpy as np
+
+from boosting import AdaBoostClassifier
 from datasets import gaussians_dataset
 from utils import plot_2d_dataset
 from utils import plot_boundary
 
-from boosting import AdaBoostClassifier
-
-plt.ion()
 
 def main_adaboost():
     """
-    Main function for testing Adaboost.
+    Main function for fitting and testing Adaboost classifier.
     """
-
     X_train, Y_train, X_test, Y_test = gaussians_dataset(2, [300, 400], [[1, 3], [-4, 8]], [[2, 3], [4, 1]])
     # X_train, Y_train, X_test, Y_test = h_shaped_dataset()
     # X_train, Y_train, X_test, Y_test = two_moon_dataset(n_samples=500, noise=0.2)
 
-    # visualize dataset
-    plot_2d_dataset(X_train, Y_train, 'Training')
+    # Visualize dataset
+    plot_2d_dataset(X_train, Y_train, 'Training', blocking=False)
 
-    # train model and predict
+    # Init model
     model = AdaBoostClassifier(n_learners=100)
 
+    # Train
     model.fit(X_train, Y_train, verbose=True)
-    P = model.predict(X_test)
 
-    # visualize the boundary!
-    plot_boundary(X_train, Y_train, model)
+    # Predict
+    y_preds = model.predict(X_test)
+    print('Accuracy on test set: {}'.format(np.mean(y_preds == Y_test)))
 
-    # evaluate and print error
-    error = float(np.sum(P == Y_test)) / Y_test.size
-    print('Test set - Classification Accuracy: {}'.format(error))
+    # Visualize the predicted boundary
+    plot_boundary(X_train, Y_train, model)
 
 
-# entry point
 if __name__ == '__main__':
+
+    plt.ion()
+
     main_adaboost()
diff --git a/lab/boosting/code/utils.py b/lab/boosting/code/utils.py
@@ -1,72 +1,76 @@
-import numpy as np
 import matplotlib.pyplot as plt
-plt.ion()
-
-cmap = 'jet'
+import numpy as np
 
 
-def plot_2d_dataset(X, Y, title=''):
+def plot_2d_dataset(X, Y, title='', cmap='jet', blocking: bool = False):
     """
     Plots a two-dimensional dataset.
 
     Parameters
     ----------
-    X: ndarray
-        data points. (shape:(n_samples, dim))
-    Y: ndarray
-        groundtruth labels. (shape:(n_samples,))
+    X: np.ndarray
+        Data points. (shape:(n_samples, dim))
+    Y: np.ndarray
+        Groundtruth labels. (shape:(n_samples,))
     title: str
-        an optional title for the plot.
+        Optional title for the plot.
+    cmap: str
+        Colormap used for plotting
+    blocking: bool
+        When set, wait for user interaction
     """
 
-    # new figure
     plt.figure()
 
-    # set lims
+    # Compute and set range limits
     x_min = np.min(X[:, 0])
     x_max = np.max(X[:, 0])
     y_min = np.min(X[:, 1])
     y_max = np.max(X[:, 1])
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
 
-    # remove ticks
+    # Remove ticks
     plt.xticks(())
     plt.yticks(())
 
-    # plot points
+    # Plot points
     plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, s=40, cmap=cmap, edgecolors='k')
     plt.title(title)
-    plt.waitforbuttonpress()
+
+    if blocking:
+        plt.waitforbuttonpress()
 
 
-def plot_boundary(X, Y, model, title=''):
+def plot_boundary(X, Y, model, title='', cmap='jet'):
     """
     Represents the boundaries of a generic learning model over data.
 
     Parameters
     ----------
-    X: ndarray
-        data points. (shape:(n_samples, dim))
-    Y: ndarray
-        groundtruth labels. (shape:(n_samples,))
+    X: np.ndarray
+        Data points. (shape:(n_samples, dim))
+    Y: np.ndarray
+        Ground truth labels. (shape:(n_samples,))
     model: SVC
-        A sklearn.SVC fit model.
+        A sklearn classifier.
     title: str
-        an optional title for the plot.
+        Optional title for the plot.
+    cmap: str
+        Colormap used for plotting
     """
 
-    # initialize subplots
+    # Initialize subplots
     fig, ax = plt.subplots(1, 2)
     ax[0].scatter(X[:, 0], X[:, 1], c=Y, s=40, zorder=10, cmap=cmap, edgecolors='k')
 
-    # evaluate lims
+    # Compute range limits
     x_min = np.min(X[:, 0])
     x_max = np.max(X[:, 0])
     y_min = np.min(X[:, 1])
     y_max = np.max(X[:, 1])
 
-    # predict all over a grid
+    # Predict all over a dense grid
     XX, YY = np.mgrid[x_min:x_max:500j, y_min:y_max:500j]
     Z = model.predict(np.c_[XX.ravel(), YY.ravel()])
 
@@ -75,14 +79,14 @@ def plot_boundary(X, Y, model, title=''):
     ax[1].pcolormesh(XX, YY, Z, cmap=plt.cm.Paired)
     ax[1].scatter(X[:, 0], X[:, 1], c=Y, s=40, zorder=10, cmap=cmap, edgecolors='k')
 
-    # set stuff for subplots
+    # Set limits and ticks for each subplot
     for s in [0, 1]:
         ax[s].set_xlim([x_min, x_max])
         ax[s].set_ylim([y_min, y_max])
         ax[s].set_xticks([])
         ax[s].set_yticks([])
 
-    ax[0].set_title('Data')
+    ax[0].set_title(title)
     ax[1].set_title('Boundary')
 
     plt.waitforbuttonpress()