forked from Asiatik/codezilla
Implemented gradient descent in C++ (Asiatik#352)

* Gradient descent implementation.
* Small fixes in implementation. Split the data into x and y data vectors instead of keeping it all in one data vector.
* Fixes and readme documentation.
1 parent 074c449 · commit d5eec87

Showing 3 changed files with 125 additions and 0 deletions.
Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp (71 additions & 0 deletions)
#include "Utility.h"

using namespace std;

// Performs predictions with the given linear model (slope, intercept).
vector<double> Predict(vector<double> x, pair<double, double> model){
    int numDataPoints = x.size();
    vector<double> predictions(numDataPoints);

    for(int i = 0; i < numDataPoints; ++i){
        predictions[i] = x[i] * model.first + model.second;
    }

    return predictions;
}

// Performs one batch gradient descent step on the model parameters.
pair<double, double> BatchGradientDescentStep(vector<double> x, vector<double> predictions, vector<double> y, double learningRate, pair<double, double> model){
    int numSamples = y.size();
    double gradientSlope = 0.0;
    double gradientIntercept = 0.0;

    // Accumulate the partial derivatives of the mean squared error
    // with respect to the slope and the intercept.
    for(int k = 0; k < numSamples; ++k){
        double error = y[k] - predictions[k];
        gradientSlope += ((-2.0) / (double) numSamples) * error * x[k];
        gradientIntercept += ((-2.0) / (double) numSamples) * error;
    }

    // Move each parameter in the direction opposite to its gradient.
    model.first = model.first - (learningRate * gradientSlope);
    model.second = model.second - (learningRate * gradientIntercept);

    return model;
}

// Runs through all the epochs, updating the model based on the calculated gradient.
pair<double, double> LinearRegression(vector<double> x, vector<double> y, unsigned int epochs, double learningRate){
    // Initialize our linear regression model as: 0x + 0.
    pair<double, double> model(0, 0);

    for(unsigned int i = 0; i < epochs; ++i){
        auto predictions = Predict(x, model);
        model = BatchGradientDescentStep(x, predictions, y, learningRate, model);
    }

    return model;
}

int main(){
    // Define the x range for data generation.
    // Note: larger data values might cause exploding gradients.
    // One possible solution is to reduce the learning rate.
    pair<int,int> range = pair<int,int>(0, 100);

    // Get data from the following linear function: 2x + 5.
    pair<vector<double>, vector<double>> data = GetLinearFunctionData(range, 2, 5);
    vector<double> xData = data.first;
    vector<double> yData = data.second;

    // Run for 10000 epochs with a learning rate of 0.0001.
    pair<double, double> model = LinearRegression(xData, yData, 10000, 0.0001);
    auto predictions = Predict(xData, model);

    cout << "Data generating function: 2x + 5" << endl;
    // Report how well the learned model fits the generated data.
    cout << "Mean squared error: " << MSE(yData, predictions) << endl;
    // The learned parameters should approach the data generating function 2x + 5.
    cout << "Learned model: " << model.first << "x + " << model.second << endl;

    return 0;
}
Machine Learning/Gradient Descent/C++/Utility.h (37 additions & 0 deletions)
/* Header-only utility functions used for gradient descent */

#pragma once

#include <iostream>
#include <vector>
#include <utility>
#include <cmath>

using namespace std;

// Generates data from the linear function: slope * x + yIntercept, over the given x range.
pair<vector<double>, vector<double>> GetLinearFunctionData(pair<int,int> range, double slope, double yIntercept){
    int numSamples = range.second - range.first;
    vector<double> xData(numSamples);
    vector<double> yData(numSamples);

    for(int i = range.first, k = 0; k < numSamples; ++i, ++k){
        xData[k] = i;
        yData[k] = i * slope + yIntercept;
    }

    pair<vector<double>, vector<double>> data(xData, yData);
    return data;
}

// Mean squared error between the actual and predicted values.
double MSE(vector<double> actual, vector<double> predicted){
    auto actualItt = actual.begin();
    auto predictedItt = predicted.begin();
    double sum = 0;
    for( ; actualItt != actual.end() && predictedItt != predicted.end(); ++actualItt, ++predictedItt){
        sum += pow(*actualItt - *predictedItt, 2);
    }
    return sum / actual.size();
}
README (17 additions & 0 deletions)
# Gradient Descent Optimisation Algorithm

This explanation will not go into the mathematical details of the algorithm; the mathematics is an important part, but it is better covered online and in books. Instead, the focus is on a high-level explanation of the algorithm.
Gradient descent is a mathematical optimization algorithm. It is essentially a hill-climbing-style algorithm that follows the gradient of the function being optimized in order to search for good parameter values. It is called gradient descent because we minimize a function by incrementally following the gradient downhill towards a local minimum, and it is often used when training machine learning models.
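Written as an update rule, with w the model parameters, L(w) the function being minimized, and lr the learning rate (the same names used in the pseudocode below), each iteration performs

$$ w \leftarrow w - lr \cdot \nabla L(w) $$

where the gradient $\nabla L(w)$ is described in the next paragraph.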
A gradient is essentially the derivative for multi-variable functions, except that it is a vector rather than a scalar. The gradient vector collects the partial derivatives of a multi-variable function with respect to its parameters. The gradient of a function with respect to its input tells us how the function behaves when we change that input, and gradient descent exploits one property in particular: the gradient vector points in the direction of steepest ascent. Hence, minimizing a function iteratively simply means calculating the gradient and moving in the opposite direction.
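Concretely, for a function f of parameters w_1, ..., w_n, the gradient is the vector of its partial derivatives:

$$ \nabla f(w) = \left( \frac{\partial f}{\partial w_1}, \dots, \frac{\partial f}{\partial w_n} \right) $$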
Practically, we derive the partial derivative of our error function with respect to each model parameter, evaluate those partial derivatives on the data, and incrementally update each parameter by a step with the opposite sign of its partial derivative.
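For the linear model used in the implementation above, where the prediction is m * x + b (m is the slope, b the intercept) and the error function is the mean squared error over n samples, these partial derivatives are

$$ \frac{\partial L}{\partial m} = -\frac{2}{n} \sum_{i=1}^{n} \left( y_i - (m x_i + b) \right) x_i, \qquad \frac{\partial L}{\partial b} = -\frac{2}{n} \sum_{i=1}^{n} \left( y_i - (m x_i + b) \right) $$

which is exactly what BatchGradientDescentStep accumulates before taking a step.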
## Pseudocode

Where lr is the learning rate, num_epochs is the number of iterations, and w are the model parameters.

    for i from 1 to num_epochs:
        for w_i in w:
            w_i = w_i - lr * partial_derivative(loss, w_i)
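
As a minimal, self-contained sketch of this loop (a toy example, not one of the committed files), the snippet below applies the same update rule to a single parameter w of the toy loss (w - 3)^2, whose partial derivative is 2(w - 3):

```cpp
#include <iostream>

int main(){
    double w = 0.0;           // single model parameter, initialized to 0
    double lr = 0.1;          // learning rate
    int num_epochs = 100;     // number of gradient descent iterations

    for(int i = 0; i < num_epochs; ++i){
        // Partial derivative of the toy loss (w - 3)^2 with respect to w.
        double gradient = 2.0 * (w - 3.0);
        // Step in the direction opposite to the gradient.
        w = w - lr * gradient;
    }

    // After enough iterations, w should be close to 3, the minimizer of the toy loss.
    std::cout << "Learned w: " << w << std::endl;
    return 0;
}
```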