I am a noob with machine learning and I have been struggling with this for a few days now and I don't understand why my neural network is having trouble classifying the mnist dataset. I checked my calculations and used the gradient check, but I can't seem to find the problem.

```
import pickle as pc
import numpy as np
import matplotlib.pyplot as mb
class MNIST:
#fix: gradient checking maybe not working, maybe backprop not working, symetrical updating, check if copying correctly
def processImg(self):
'''
#slower than pickle file
inTrset = np.loadtxt("mnist_train.csv", delimiter = ",");
inTestSet = np.loadtxt("mnist_test.csv", delimiter = ",");
fullX = np.asfarray(inTrset(:,1:))
fullY = np.asfarray(inTrset(:, :1))
'''
with open("binaryMNIST.pkl", "br") as fh:
data = pc.load(fh)
img_dim = 28;
features = 784;
m = 60000
test_m = 10000;
fullX = (np.asfarray(data(0)))
bias = np.ones((60000, 1))
fullX = np.hstack((bias, fullX))
fullY = np.asfarray(data(1))
testX = (np.asfarray(data(2)))
bias2 = np.ones((10000, 1))
testX = np.hstack((bias2, testX))
testY = np.asfarray(data(3))
fullY = fullY.astype(int)
testY = testY.astype(int)
iden = np.identity(10, dtype = np.int)
oneHot = np.zeros((m, 10), dtype = np.int)
oneHot_T = np.zeros((test_m, 10), dtype = np.int)
#creates m number of one, zeros vector indicating the class
for i in range(test_m):
oneHot_T(i) = iden(testY(i), :)
for i in range(m):
oneHot(i) = iden(fullY(i), :)
trainX = fullX(:40000, :)
trainY = oneHot(:40000, :)
valX = np.asfarray(fullX(40000:, :))
valY = np.asfarray(oneHot(40000:, :))
self.trainX = trainX
self.trainY = trainY
self.valX = valX
self.valY = valY
self.testX = testX
self.oneHot_T = oneHot_T
def setThetas(self):
#784 features
#5 nodes per layer (not including bias)
#(nodes in previous layer, nodes in next layer)
#theta1(785, 5) theta2(6, 5) theta3(6, 10)
#after finishing, do big 3d matrix of theta and vectorize backprop
params = np.random.rand(4015)
self.params = params
def fbProp(self, theta1, theta2, theta3):
#after calculating a w/sig(), add bias
m = np.shape(self.trainY)(0)
z1 = np.array(np.dot(self.trainX, theta1), dtype = np.float64)
a1 = self.sig(z1)
bias = np.ones((40000, 1))
a1 = np.hstack((bias, a1))
z2 = np.dot(a1, theta2)
a2 = self.sig(z2)
a2 = np.hstack((bias, a2))
z3 = np.dot(a2, theta3)
hyp = self.sig(z3)
g3 = 0
g2 = 0
g1 = 0
for i in range(m):
dOut = hyp(i, :) - self.trainY(i, :)
d2 = np.dot(np.transpose(dOut), np.transpose(theta3))
d2 = d2(1:) * self.sigG(z2(i, :))
d1 = np.dot(d2, np.transpose(theta2))
d1 = d1(1:) * self.sigG(z1(i, :))
g3 = g3 + np.dot(np.transpose(np.array(a2(i, :), ndmin = 2)), np.array(dOut, ndmin = 2))
g2 = g2 + np.dot(np.transpose(np.array(a1(i, :), ndmin = 2)), np.array(d1, ndmin = 2))
g1 = g1 + np.dot(np.transpose(np.array(self.trainX(i, :), ndmin = 2)), np.array(d1, ndmin = 2))
self.theta1G = (1/m) * g1
self.theta2G = (1/m) * g2
self.theta3G = (1/m) * g3
def gradDescent(self):
params = np.array(self.params)
theta1 = params(0:3925)
theta1 = np.resize(theta1, (785, 5))
theta2 = params(3925:3955)
theta2 = np.resize(theta2, (6, 5))
theta3 = params(3955:4015)
theta3 = np.resize(theta3, (6, 10))
for i in range(self.steps):
J = self.error(theta1, theta2, theta3, self.trainX, self.trainY)
print("Iteration: ", i+1, " | error: ", J)
self.fbProp(theta1, theta2, theta3)
theta1 = theta1 - (self.alpha * self.theta1G)
theta2 = theta2 - (self.alpha * self.theta2G)
theta3 = theta3 - (self.alpha * self.theta3G)
#On test set
correct = self.test(theta1, theta2, theta3)
print(correct/100, "%")
def error(self, params, X, y):
theta1 = params(0:3925)
theta1 = np.resize(theta1, (785, 5))
theta2 = params(3925:3955)
theta2 = np.resize(theta2, (6, 5))
theta3 = params(3955:4015)
theta3 = np.resize(theta3, (6, 10))
bias = np.ones((np.shape(y)(0), 1))
a1 = self.sig(np.dot(X, theta1))
a1 = np.hstack((bias, a1))
a2 = self.sig(np.dot(a1, theta2))
a2 = np.hstack((bias, a2))
hyp = self.sig(np.dot(a2, theta3))
#10 classes
pt1 = ((-np.log(hyp) * y) - (np.log(1-hyp) * (1-y))).sum()
J = 1/(40000) * pt1.sum()
return J
def error(self, theta1, theta2, theta3, X, y):
bias = np.ones((np.shape(y)(0), 1))
a1 = self.sig(np.dot(X, theta1))
a1 = np.hstack((bias, a1))
a2 = self.sig(np.dot(a1, theta2))
a2 = np.hstack((bias, a2))
hyp = self.sig(np.dot(a2, theta3))
print(hyp(0, :))
#10 classes
pt1 = ((np.log(hyp) * y) + (np.log(1-hyp) * (1-y))).sum()
J = - (1/(40000)) * pt1.sum()
return J
#def validate(self):
def test(self, theta1, theta2, theta3):
X = self.testX
y = self.oneHot_T
bias = np.ones((np.shape(y)(0), 1))
a1 = self.sig(np.dot(X, (theta1)))
a1 = np.hstack((bias, a1))
a2 = self.sig(np.dot(a1, (theta2)))
a2 = np.hstack((bias, a2))
hyp = self.sig(np.dot(a2, (theta3)))
correct = 0
ans = np.array((0, 1, 2, 3, 4, 5, 6, 7, 8, 9))
for i in range(np.shape(y)(0)):
#fix backprop and forward prop then this
guess = np.argmax(hyp(i, :))
match = np.argmax(y(i, :))
print("guess: ", guess, "| ans: ", match)
if guess == match:
correct = correct + 1;
return correct
def gradientCheck(self):
params = np.array(self.params)
theta1 = params(0:3925)
theta1 = np.resize(theta1, (785, 5))
theta2 = params(3925:3955)
theta2 = np.resize(theta2, (6, 5))
theta3 = params(3955:4015)
theta3 = np.resize(theta3, (6, 10))
self.fbProp(theta1, theta2, theta3)
grad = self.theta1G.ravel()
grad = np.append(grad, self.theta2G.ravel())
grad = np.append(grad, self.theta3G.ravel())
print("got grads")
epsilon = 0.00001
params2 = np.array(self.params)
check = np.zeros(np.shape(params))
for i in range(3965, np.size(params)):
temp = params(i)
params(i) = params(i) + epsilon
params2(i) = params2(i) - epsilon
check(i) = (self.error(params, self.trainX, self.trainY) - self.error(params2, self.trainX, self.trainY)) / (2 * epsilon)
params(i) = temp
params2(i) = temp
print(grad(i), " ", check(i))
def sigG(self, z):
return (self.sig(z) * (1-self.sig(z)))
def sig(self, z):
return 1/(1+(np.exp(-z)))
def printPictures(self):
#number of training examples to iterate over
for i in range(3):
img = self.trainX(i, 1:).reshape((28,28))
mb.title('Digit = {}'.format(np.argmax(self.trainY(i,:))))
mb.imshow(img, cmap = 'gray_r')
mb.show()
def __init__(self, steps, alpha, nodes, h_layers):
self.steps = steps
self.alpha = alpha
self.nodes = nodes
self.h_layers = h_layers
obj = MNIST(100, 0.1, 5, 1);
obj.processImg();
obj.setThetas();
obj.gradDescent()
#obj.gradientCheck()
#obj.printPictures()
```