import random

import numpy as np

import mnist


def sigmoid(z):
    """
    The sigmoid function, applied elementwise.
    In this network z has shape [30, 1] or [10, 1].
    """
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z) * (1 - sigmoid(z))
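

# Quick sanity check for sigmoid_prime (an illustrative sketch, not part of the
# original code): compare the analytic derivative against a central finite
# difference. The helper name, test point and tolerance are my own choices.
def _check_sigmoid_prime(z=0.5, eps=1e-6):
    numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
    analytic = sigmoid_prime(z)
    return abs(numeric - analytic) < 1e-8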


class Network:

    def __init__(self, sizes):
        """
        The list ``sizes`` contains the number of neurons in the
        respective layers of the network. For example, if the list
        was [2, 3, 1] then it would be a three-layer network, with the
        first layer containing 2 neurons, the second layer 3 neurons,
        and the third layer 1 neuron. The biases and weights for the
        network are initialized randomly, using a Gaussian
        distribution with mean 0 and variance 1. Note that the first
        layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only
        ever used in computing the outputs from later layers.

        :param sizes: e.g. [784, 30, 10]
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # one bias vector per non-input layer, shape [ch_out, 1]
        self.biases = [np.random.randn(ch_out, 1) for ch_out in sizes[1:]]
        # one weight matrix per pair of adjacent layers, shape [ch_out, ch_in]
        self.weights = [np.random.randn(ch_out, ch_in)
                        for ch_in, ch_out in zip(sizes[:-1], sizes[1:])]
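
    # Illustrative helper, not part of the original class: summarizes the
    # parameter shapes created in __init__. For sizes = [784, 30, 10] it
    # returns biases of shape (30, 1), (10, 1) and weights of shape
    # (30, 784), (10, 30).
    def _shape_summary(self):
        return ([b.shape for b in self.biases],
                [w.shape for w in self.weights])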

    def forward(self, x):
        """
        Feed the input forward through every layer.

        :param x: [784, 1]
        :return: [10, 1]
        """
        for b, w in zip(self.biases, self.weights):
            # [30, 784] @ [784, 1] + [30, 1] => [30, 1]
            # [10, 30]  @ [30, 1]  + [10, 1] => [10, 1]
            x = sigmoid(np.dot(w, x) + b)
        return x
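
    # Illustrative smoke test, not part of the original code: run one forward
    # pass on a random input of the right size and check the output shape.
    def _forward_smoke_test(self):
        x = np.random.randn(self.sizes[0], 1)      # e.g. [784, 1]
        out = self.forward(x)
        assert out.shape == (self.sizes[-1], 1)    # e.g. [10, 1]
        return out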

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """
        Train the neural network using mini-batch stochastic
        gradient descent. The ``training_data`` is a list of tuples
        ``(x, y)`` representing the training inputs and the desired
        outputs. The other non-optional parameters are
        self-explanatory. If ``test_data`` is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out. This is useful for
        tracking progress, but slows things down substantially.
        """
        if test_data:
            n_test = len(test_data)

        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)

            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)]

            # one gradient-descent step per mini-batch
            for mini_batch in mini_batches:
                loss = self.update_mini_batch(mini_batch, eta)

            if test_data:
                print("Epoch {0}: {1} / {2}, Loss: {3}".format(
                    j, self.evaluate(test_data), n_test, loss))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        """
        Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
        is the learning rate.
        """
        # nabla = gradient, see https://en.wikipedia.org/wiki/Del
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        loss = 0

        # accumulate the per-example gradients over the mini-batch
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w, loss_ = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
            loss += loss_

        # tmp1 = [np.linalg.norm(b/len(mini_batch)) for b in nabla_b]
        # tmp2 = [np.linalg.norm(w/len(mini_batch)) for w in nabla_w]
        # print(tmp1)
        # print(tmp2)

        self.weights = [w - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]
        loss = loss / len(mini_batch)

        return loss
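
    # Update-rule note (added for clarity, not from the original): the two list
    # comprehensions above implement
    #     w <- w - (eta / m) * sum_x dC_x/dw
    #     b <- b - (eta / m) * sum_x dC_x/db
    # with m = len(mini_batch), i.e. one gradient-descent step on the average
    # of the per-example gradients accumulated in nabla_w and nabla_b.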

    def backprop(self, x, y):
        """
        Return a tuple ``(nabla_b, nabla_w, loss)`` where ``nabla_b`` and
        ``nabla_w`` are the layer-by-layer gradients of the cost function
        C_x, stored as lists of numpy arrays similar to ``self.biases``
        and ``self.weights``, and ``loss`` is the squared error for this
        example.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # 1. forward pass
        activation = x
        # w @ x + b = z => sigmoid(z) => next activation
        zs = []  # list to store all the z vectors, layer by layer
        activations = [x]  # list to store all the activations, layer by layer
        for b, w in zip(self.biases, self.weights):
            # np.dot on 2-D arrays is matrix multiplication (@), see
            # https://stackoverflow.com/questions/34142485/difference-between-numpy-dot-and-python-3-5-matrix-multiplication
            # [30, 784] @ [784, 1] + [30, 1] => [30, 1]
            # [10, 30]  @ [30, 1]  + [10, 1] => [10, 1]
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        loss = np.power(activations[-1] - y, 2).sum()

        # 2. backward pass
        # output layer: delta = (O_k - t_k) * O_k * (1 - O_k)
        # [10, 1] * [10, 1] => [10, 1]
        delta = self.cost_prime(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        # [10, 1] @ [1, 30] => [10, 30]
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book. Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on. It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            # [30, 1]
            z = zs[-l]
            sp = sigmoid_prime(z)
            # [10, 30]^T => [30, 10] @ [10, 1] => [30, 1], then * [30, 1]
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())

        return nabla_b, nabla_w, loss
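
    # Illustrative gradient check, not part of the original code: compare the
    # backprop gradient of one bias entry against a central finite difference
    # of the squared error. The helper name and epsilon are my own choices.
    # Note the factor 2: cost_prime uses dC/da = (a - y), i.e. the gradient of
    # 0.5 * ||a - y||^2, while the loss reported by backprop omits the 0.5.
    def _gradient_check(self, x, y, eps=1e-5):
        nabla_b, _, _ = self.backprop(x, y)

        def squared_error():
            return np.power(self.forward(x) - y, 2).sum()

        self.biases[-1][0, 0] += eps
        loss_plus = squared_error()
        self.biases[-1][0, 0] -= 2 * eps
        loss_minus = squared_error()
        self.biases[-1][0, 0] += eps  # restore the original bias
        numeric = (loss_plus - loss_minus) / (2 * eps)
        return numeric, 2 * nabla_b[-1][0, 0]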

    def evaluate(self, test_data):
        """
        Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation.
        """
        # the labels y are one-hot vectors, so compare argmax indices
        test_results = [(np.argmax(self.forward(x)), np.argmax(y))
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_prime(self, output_activations, y):
        r"""
        Return the vector of partial derivatives \partial C_x /
        \partial a for the output activations.
        """
        return output_activations - y
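
    # Cost note (added for clarity, not from the original): cost_prime is the
    # gradient of the quadratic cost C_x = 0.5 * ||a - y||^2 with respect to
    # the output activations a, i.e. dC_x/da = a - y.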


def main():
    x_train, y_train, x_test, y_test = mnist.load_data(reshape=[784, 1])
    # (50000, 784) (50000, 10) (10000, 784) (10000, 10)
    print('x_train, y_train, x_test, y_test:',
          x_train.shape, y_train.shape, x_test.shape, y_test.shape)

    np.random.seed(66)

    model = Network([784, 30, 10])
    data_train = list(zip(x_train, y_train))
    data_test = list(zip(x_test, y_test))
    model.SGD(data_train, 10000, 10, 0.1, data_test)
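

# Illustrative smoke test, not part of the original file: exercises the full
# training loop on a tiny random dataset so the code can be run without the
# ``mnist`` module. All names and hyperparameters here are my own choices.
def smoke_test():
    np.random.seed(0)
    net = Network([4, 8, 3])
    xs = [np.random.randn(4, 1) for _ in range(50)]
    ys = []
    for _ in range(50):
        y = np.zeros((3, 1))
        y[np.random.randint(3), 0] = 1.0
        ys.append(y)
    data = list(zip(xs, ys))
    net.SGD(data, epochs=2, mini_batch_size=10, eta=0.5, test_data=data)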


if __name__ == '__main__':
    main()