
Commit 81d0692

backpropagation
1 parent 90c67ee commit 81d0692

File tree

5 files changed: +501 additions, 0 deletions

Binary file not shown.
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
import os
import gzip
import numpy as np

DATA_URL = 'http://yann.lecun.com/exdb/mnist/'


# Download and import the MNIST dataset from Yann LeCun's website.
# Reserve 10,000 examples from the training set for validation.
# Each image is an array of 784 (28x28) float values from 0 (white) to 1 (black).
def load_data(one_hot=True, reshape=None, validation_size=10000):
    x_tr = load_images('train-images-idx3-ubyte.gz')
    y_tr = load_labels('train-labels-idx1-ubyte.gz')
    x_te = load_images('t10k-images-idx3-ubyte.gz')
    y_te = load_labels('t10k-labels-idx1-ubyte.gz')

    x_tr = x_tr[:-validation_size]
    y_tr = y_tr[:-validation_size]

    if one_hot:
        y_tr, y_te = [to_one_hot(y) for y in (y_tr, y_te)]
        # (N, 10) -> (N, 10, 1): each label becomes a column vector,
        # matching the network's (10, 1) output layer.
        y_tr, y_te = [y.reshape(-1, 10, 1) for y in (y_tr, y_te)]
    else:
        # Keep integer labels as single-element columns.
        y_tr, y_te = [y.reshape(-1, 1) for y in (y_tr, y_te)]

    if reshape:
        x_tr, x_te = [x.reshape(-1, *reshape) for x in (x_tr, x_te)]

    return x_tr, y_tr, x_te, y_te

def load_images(filename):
    maybe_download(filename)
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    # Scale the raw bytes to floats in [0, 1).
    return data.reshape(-1, 28 * 28) / np.float32(256)

def load_labels(filename):
    maybe_download(filename)
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    return data

# Download the file, unless it's already here.
def maybe_download(filename):
    if not os.path.exists(filename):
        from urllib.request import urlretrieve
        print("Downloading %s" % filename)
        urlretrieve(DATA_URL + filename, filename)

# Convert class labels from scalars to one-hot vectors.
def to_one_hot(labels, num_classes=10):
    return np.eye(num_classes)[labels]
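
A minimal usage sketch of this loader, assuming it is saved under the name the training script below imports, i.e. mnist.py (the filename itself is not shown in the diff):

import numpy as np
import mnist  # assumed module name for the loader above

x_tr, y_tr, x_te, y_te = mnist.load_data(reshape=[784, 1])
# With the defaults above: x_tr is (50000, 784, 1) and y_tr is (50000, 10, 1),
# i.e. each sample is a column image paired with a one-hot column label.
print(x_tr.shape, y_tr.shape, x_te.shape, y_te.shape)

# to_one_hot turns integer labels into rows of the identity matrix:
print(mnist.to_one_hot(np.array([3, 5])))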
Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
import random
import numpy as np
import mnist


def sigmoid(z):
    """
    The sigmoid function, applied elementwise
    (e.g. to a [30, 1] or [10, 1] array).
    """
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))


class Network:

    def __init__(self, sizes):
        """
        The list ``sizes`` contains the number of neurons in the
        respective layers of the network. For example, if the list
        was [2, 3, 1] then it would be a three-layer network, with the
        first layer containing 2 neurons, the second layer 3 neurons,
        and the third layer 1 neuron. The biases and weights for the
        network are initialized randomly, using a Gaussian
        distribution with mean 0, and variance 1. Note that the first
        layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only
        ever used in computing the outputs from later layers.
        :param sizes: e.g. [784, 30, 10]
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # one bias vector per non-input layer: [ch_out, 1]
        self.biases = [np.random.randn(ch_out, 1) for ch_out in sizes[1:]]
        # one weight matrix per layer transition: [ch_out, ch_in]
        self.weights = [np.random.randn(ch_out, ch_in)
                        for ch_in, ch_out in zip(sizes[:-1], sizes[1:])]

    def forward(self, x):
        """
        Feed the input forward through every layer.
        :param x: [784, 1]
        :return: [10, 1], the activation of the output layer
        """
        for b, w in zip(self.biases, self.weights):
            # [30, 784] @ [784, 1] + [30, 1] => [30, 1]
            # [10, 30] @ [30, 1] + [10, 1] => [10, 1]
            x = sigmoid(np.dot(w, x)+b)
        return x

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """
        Train the neural network using mini-batch stochastic
        gradient descent. The ``training_data`` is a list of tuples
        ``(x, y)`` representing the training inputs and the desired
        outputs. The other non-optional parameters are
        self-explanatory. If ``test_data`` is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out. This is useful for
        tracking progress, but slows things down substantially.
        """
        if test_data:
            n_test = len(test_data)

        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)

            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]

            # one gradient-descent step per mini-batch
            for mini_batch in mini_batches:
                loss = self.update_mini_batch(mini_batch, eta)

            if test_data:
                # `loss` here is the average loss of the last mini-batch of this epoch
                print("Epoch {0}: {1} / {2}, Loss: {3}".format(
                    j, self.evaluate(test_data), n_test, loss))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        """
        Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
        is the learning rate.
        """
        # https://en.wikipedia.org/wiki/Del
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        loss = 0

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w, loss_ = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
            loss += loss_

        # tmp1 = [np.linalg.norm(b/len(mini_batch)) for b in nabla_b]
        # tmp2 = [np.linalg.norm(w/len(mini_batch)) for w in nabla_w]
        # print(tmp1)
        # print(tmp2)

        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]
        loss = loss / len(mini_batch)

        return loss

    def backprop(self, x, y):
        """
        Return a tuple ``(nabla_b, nabla_w, loss)`` where ``nabla_b`` and
        ``nabla_w`` are the gradients of the cost function C_x, stored as
        layer-by-layer lists of numpy arrays (similar to ``self.biases``
        and ``self.weights``), and ``loss`` is the squared error for this
        single example.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # 1. forward
        activation = x
        # w@x + b = z => sigmoid => activation
        zs = []  # list to store all the z vectors, layer by layer
        activations = [x]  # list to store all the activations, layer by layer
        for b, w in zip(self.biases, self.weights):
            # https://stackoverflow.com/questions/34142485/difference-between-numpy-dot-and-python-3-5-matrix-multiplication
            # np.dot here is a matrix product (like @), not an element-wise *
            z = np.dot(w, activation)
            z = z + b  # e.g. [30, 784] @ [784, 1] + [30, 1] => [30, 1]
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Squared error of this example. Note the gradient below corresponds
        # to the cost 0.5*sum((a-y)**2); the reported loss omits the 1/2.
        loss = np.power(activations[-1]-y, 2).sum()

        # 2. backward
        # output layer: delta_k = (O_k - t_k) * O_k * (1 - O_k)
        # [10, 1] * [10, 1] => [10, 1]
        delta = self.cost_prime(activations[-1], y) * sigmoid_prime(zs[-1])
        # [10, 1]
        nabla_b[-1] = delta
        # delta_k * O_j
        # [10, 1] @ [1, 30] => [10, 30]
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book. Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on. It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            # [30, 1]
            z = zs[-l]
            sp = sigmoid_prime(z)
            # [10, 30]^T @ [10, 1] => [30, 1], then element-wise * with [30, 1]
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())

        return nabla_b, nabla_w, loss

    def evaluate(self, test_data):
        """
        Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation; the
        one-hot test labels are reduced to a digit the same way.
        """
        test_results = [(np.argmax(self.forward(x)), np.argmax(y))
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_prime(self, output_activations, y):
        r"""
        Return the vector of partial derivatives \partial C_x /
        \partial a for the output activations.
        """
        return output_activations-y


def main():
    x_train, y_train, x_test, y_test = mnist.load_data(reshape=[784, 1])
    # (50000, 784, 1) (50000, 10, 1) (10000, 784, 1) (10000, 10, 1)
    print('x_train, y_train, x_test, y_test:', x_train.shape, y_train.shape, x_test.shape, y_test.shape)

    np.random.seed(66)

    model = Network([784, 30, 10])
    data_train = list(zip(x_train, y_train))
    data_test = list(zip(x_test, y_test))
    model.SGD(data_train, 10000, 10, 0.1, data_test)


if __name__ == '__main__':
    main()
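
Since the point of this commit is the hand-written backward pass, a finite-difference check is a useful sanity test. The sketch below is an illustration, not part of the commit: it verifies the output-layer rule delta = (a - y) * sigmoid'(z) used in backprop() against numerical derivatives of the quadratic cost 0.5 * sum((a - y)**2), on a single made-up example.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
z = rng.standard_normal((10, 1))   # pre-activations of a hypothetical output layer
y = np.eye(10)[:, [3]]             # one-hot target as a (10, 1) column

def cost(z):
    a = sigmoid(z)
    return 0.5 * np.sum((a - y) ** 2)   # quadratic cost matching cost_prime above

# analytic delta = dC/dz, exactly as computed in backprop()
a = sigmoid(z)
delta = (a - y) * a * (1 - a)

# central finite differences, one component at a time
eps = 1e-6
numeric = np.zeros_like(z)
for i in range(z.shape[0]):
    zp, zm = z.copy(), z.copy()
    zp[i] += eps
    zm[i] -= eps
    numeric[i] = (cost(zp) - cost(zm)) / (2 * eps)

print(np.max(np.abs(delta - numeric)))   # should be on the order of 1e-9 or smaller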
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
"""
mnist_loader
~~~~~~~~~~~~

A library to load the MNIST image data. For details of the data
structures that are returned, see the doc strings for ``load_data``
and ``load_data_wrapper``. In practice, ``load_data_wrapper`` is the
function usually called by our neural network code.
"""

#### Libraries
# Standard library
import pickle
import gzip

# Third-party libraries
import numpy as np

def load_data():
    """Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.

    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images. This is a
    numpy ndarray with 50,000 entries. Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.

    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries. Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.

    The ``validation_data`` and ``test_data`` are similar, except
    each contains only 10,000 images.

    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    below.
    """
    f = gzip.open('data/mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    f.close()
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    """Return a tuple containing ``(training_data, validation_data,
    test_data)``. Based on ``load_data``, but the format is more
    convenient for use in our implementation of neural networks.

    In particular, ``training_data`` is a list containing 50,000
    2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
    containing the input image. ``y`` is a 10-dimensional
    numpy.ndarray representing the unit vector corresponding to the
    correct digit for ``x``.

    ``validation_data`` and ``test_data`` are lists containing 10,000
    2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
    numpy.ndarray containing the input image, and ``y`` is the
    corresponding classification, i.e., the digit value (an integer)
    corresponding to ``x``.

    Obviously, this means we're using slightly different formats for
    the training data and the validation / test data. These formats
    turn out to be the most convenient for use in our neural network
    code."""
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = list(zip(training_inputs, training_results))
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = list(zip(validation_inputs, va_d[1]))
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = list(zip(test_inputs, te_d[1]))
    return (training_data, validation_data, test_data)

def vectorized_result(j):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere. This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    network."""
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e
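
For completeness, a minimal usage sketch of this loader (an illustration, not part of the commit), assuming data/mnist.pkl.gz is present and the file is importable under the name given in its docstring, mnist_loader:

import numpy as np
import mnist_loader  # assumed module name, matching the docstring above

training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
x, y = training_data[0]
print(x.shape, y.shape)   # (784, 1) (10, 1): column image, one-hot column label
print(test_data[0][1])    # test labels stay plain integer digits, e.g. 7
print(np.argmax(y))       # recover the training digit from the one-hot vector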
