This post demonstrates how to implement a three-layer neural network for digit recognition.
The detailed derivation of the algorithm can be found in this script.
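For quick reference, the objective the code below actually minimizes is the mean squared error of the sigmoid output layer plus an L2 weight-decay term (the notation simply mirrors the variables in the listing at the end of the post, with the reg parameter written as λ):

$$
J(W, b) = \frac{1}{2m} \sum_{i=1}^{m} \left\| a^{(L)}_i - Y_i \right\|^2 + \frac{\lambda}{2} \sum_{l} \left\| W^{(l)} \right\|_F^2,
\qquad
a^{(l+1)} = \sigma\!\left(a^{(l)} W^{(l)} + b^{(l)}\right),
$$

where m is the number of training samples, Y is the binary (one-hot) class matrix, a^{(L)} is the output activation, and σ is the sigmoid.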
Main workflow
- Prepare the training, validation, and test datasets.
- Set the weight decay and numerical parameters.
- Check that the gradients of the loss function are correct.
- Train the model.
- Estimate the accuracy of the predictions.
IPython notebook
In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from dnn_play.classifiers.neural_net import NeuralNet, neural_net_loss, rel_err_gradients
from dnn_play.utils.data_utils import load_mnist
from dnn_play.utils.visualize_utils import display_network
# Plot settings
plt.rcParams['figure.figsize'] = (10.0, 10.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
In [2]:
# Load MNIST data
(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist()
#(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist(n_train=5500, n_val=500, n_test=1000)
print("X_train shape = {} y_train shape = {}".format(X_train.shape, y_train.shape))
print("X_val shape = {} y_val shape = {}".format(X_val.shape, y_val.shape))
print("X_test shape = {} y_test shape = {}".format(X_test.shape, y_test.shape))
In [3]:
# Number of layer units
input_size = X_train.shape[1] # Dimension of features
hidden_size = 28
n_classes = np.max(y_train) + 1
layer_units = (input_size, hidden_size, n_classes)
# Hyperparameters
reg = 1e-4  # Regularization strength (weight decay)
# Numerical parameters
max_iters = 400
# Initialize weights
clf = NeuralNet(layer_units)
weights = clf.init_weights()
loss, grad = neural_net_loss(weights, X_train, y_train, reg)
print('loss: %f' % loss)
In [4]:
# Gradient checking
if rel_err_gradients() < 1e-8:
    print("Gradient check passed!")
else:
    print("Gradient check failed!")
In [5]:
"""
Training
"""
weights, loss_history, train_acc_history, val_acc_history = clf.fit(X_train, y_train, X_val, y_val,
                                                                     reg=reg, max_iters=max_iters, verbose=True)
In [6]:
# Plot the loss function and train / validation accuracies
plt.subplot(2, 1, 1)
plt.plot(loss_history)
plt.title('Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
Out[6]:
In [7]:
# Visualize the weights
W0 = weights[0]['W']
image = display_network(W0)
plt.imshow(image, cmap = plt.cm.gray)
Out[7]:
In [8]:
# Make predictions
pred = clf.predict(X_test)
acc = np.mean(y_test == pred)
print("Accuracy: {:5.2f}% \n".format(acc*100))
In [9]:
# View some images and predictions
n_images = 3
images = X_test[:n_images].reshape((n_images, 28, 28))
pred = clf.predict(X_test[:n_images])
for i in range(n_images):
    plt.subplot(1, n_images, i+1)
    plt.imshow(images[i], cmap = plt.cm.gray)
    plt.title('Predicted digit: {}'.format(pred[i]))
    plt.axis('off')
NeuralNet classifier
import numpy as np
import scipy.optimize

from dnn_play.activations import sigmoid, sigmoid_deriv
from dnn_play.utils.np_utils import to_binary_class_matrix, flatten_struct, pack_struct
from dnn_play.utils.gradient_utils import eval_numerical_gradient, rel_norm_diff


def ac_func(x):
    return sigmoid(x)

def ac_func_deriv(x):
    return sigmoid_deriv(x)
class NeuralNet(object):
    """
    Classifier of the neural network.
    """

    def __init__(self, layer_units, weights=None):
        self.weights = weights
        self.layer_units = layer_units

    def init_weights(self, eps=1e-4):
        """
        Initialize weights.

        layer_units: tuple storing the size of each layer.
        weights:     structured weights (a list of {'W', 'b'} dicts, one per layer).
        """
        layer_units = self.layer_units
        n_layers = len(layer_units)

        weights = [{} for i in range(n_layers - 1)]
        for i in range(n_layers - 1):
            weights[i]['W'] = eps * np.random.randn(layer_units[i], layer_units[i+1])
            weights[i]['b'] = np.zeros(layer_units[i+1])

        self.weights = weights
        return self.weights
    def fit(self, X, y, X_val, y_val,
            reg=0.0,
            learning_rate=1e-2,
            optimizer='L-BFGS-B', max_iters=100,
            sample_batches=True,
            n_epochs=30, batch_size=128,
            verbose=False):

        epoch = 0
        best_val_acc = 0.0
        best_weights = {}

        if self.weights is None:
            # Lazily initialize weights
            self.weights = self.init_weights()

        # Solve with L-BFGS-B
        options = {'maxiter': max_iters, 'disp': verbose}

        def J(theta):
            weights = pack_struct(theta, self.layer_units)
            loss, grad = neural_net_loss(weights, X, y, reg)
            grad = flatten_struct(grad)
            return loss, grad

        # Callback to record the loss and the training / validation accuracies
        iter_feval = 0
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        def progress(x):
            nonlocal iter_feval, best_weights, best_val_acc
            iter_feval += 1

            # Loss history
            weights = pack_struct(x, self.layer_units)
            loss, grad = neural_net_loss(weights, X, y, reg)
            loss_history.append(loss)

            # Training accuracy
            y_pred_train = neural_net_predict(weights, X)
            train_acc = np.mean(y_pred_train == y)
            train_acc_history.append(train_acc)

            # Validation accuracy
            y_pred_val = neural_net_predict(weights, X_val)
            val_acc = np.mean(y_pred_val == y_val)
            val_acc_history.append(val_acc)

            # Keep track of the best weights based on validation accuracy
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                n_weights = len(weights)
                best_weights = [{} for i in range(n_weights)]
                for i in range(n_weights):
                    for p in weights[i]:
                        best_weights[i][p] = weights[i][p].copy()

            # Report progress roughly 20 times over the run
            n_iters_verbose = max(max_iters // 20, 1)
            if iter_feval % n_iters_verbose == 0:
                print("iter: {:4d}, loss: {:8f}, train_acc: {:4f}, val_acc: {:4f}".format(iter_feval, loss, train_acc, val_acc))

        # Minimize the loss function
        init_theta = flatten_struct(self.weights)
        results = scipy.optimize.minimize(J, init_theta, method=optimizer, jac=True, callback=progress, options=options)

        # Save the best weights
        self.weights = best_weights

        return self.weights, loss_history, train_acc_history, val_acc_history
    def predict(self, X):
        """
        X:    matrix of shape (n_samples, n_features); each row X[i] is one example.
        pred: the predicted class labels.
        """
        pred = neural_net_predict(self.weights, X)
        return pred

    def flatten_struct(self, data):
        return flatten_struct(data)

    def pack_struct(self, data):
        return pack_struct(data, self.layer_units)
def neural_net_loss(weights, X, y, reg):
    """
    Compute the loss and gradients of the neural network.
    """

    Y = to_binary_class_matrix(y)
    L = len(weights)  # The index of the output layer

    z = []
    a = []

    # Number of samples
    m = X.shape[0]

    # Forward pass
    z.append(0)   # Dummy element
    a.append(X)   # Input activation
    for i in range(0, L):
        W = weights[i]['W']
        b = weights[i]['b']
        z.append(np.dot(a[-1], W) + b)
        a.append(ac_func(z[-1]))

    # Loss function: mean squared error plus L2 weight decay
    sum_weight_square = 0.0  # Sum of squared weights
    for i in range(L):
        W = weights[i]['W']
        sum_weight_square += np.sum(W*W)
    loss = 1.0/(2.0*m) * np.sum((a[-1] - Y)**2) + 0.5*reg*sum_weight_square

    # Backpropagation
    delta = [(a[-1] - Y) * ac_func_deriv(z[-1])]
    for i in reversed(range(L)):  # Note that delta[0] will not be used
        W = weights[i]['W']
        d = np.dot(delta[0], W.T) * ac_func_deriv(z[i])
        delta.insert(0, d)  # Insert element at the beginning

    # Gradients
    grad = [{} for i in range(L)]
    for i in range(L):
        W = weights[i]['W']
        grad[i]['W'] = np.dot(a[i].T, delta[i+1]) / m + reg*W
        grad[i]['b'] = np.mean(delta[i+1], axis=0)

    return loss, grad
def neural_net_predict(weights, X):
    """
    X:    matrix of shape (n_samples, n_features); each row X[i] is one example.
    pred: the predicted class labels.
    """

    L = len(weights)  # The index of the output layer
    z = []
    a = []

    # Number of samples
    m = X.shape[0]

    # Forward pass
    z.append(0)   # Dummy element
    a.append(X)   # Input activation
    for i in range(0, L):
        W = weights[i]['W']
        b = weights[i]['b']
        z.append(np.dot(a[-1], W) + b)
        a.append(ac_func(z[-1]))

    # Predictions: the class with the highest output activation
    pred = np.argmax(a[-1], axis=1)

    return pred
def rel_err_gradients():
    """
    Return the relative error between the analytic and numerical gradients.
    """

    # Number of layer units
    n_samples = 100
    input_size = 4 * 4
    hidden_size = 4
    n_classes = 10
    layer_units = (input_size, hidden_size, n_classes)

    X_train = np.random.randn(n_samples, input_size)
    y_train = np.random.randint(n_classes, size=n_samples)
    reg = 1e-4

    # Define the classifier
    clf = NeuralNet(layer_units)

    # Initialize weights
    weights = clf.init_weights()

    # Analytic gradients of the cost function
    cost, grad = neural_net_loss(weights, X_train, y_train, reg)
    grad = clf.flatten_struct(grad)  # Flattened gradients

    def J(theta):
        # Structured weights
        weights = clf.pack_struct(theta)
        return neural_net_loss(weights, X_train, y_train, reg)[0]

    theta = clf.flatten_struct(weights)
    numerical_grad = eval_numerical_gradient(J, theta)

    # Compare the numerically computed gradients with those computed analytically
    rel_err = rel_norm_diff(numerical_grad, grad)

    return rel_err
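Since scipy.optimize.minimize works on a single flat parameter vector, the classifier round-trips between the structured list of {'W', 'b'} dicts and a 1-D array via flatten_struct / pack_struct from dnn_play.utils.np_utils. The sketch below is only an assumption of how such a round trip could look, not the library's actual implementation:

import numpy as np

def flatten_struct_sketch(weights):
    # Concatenate every W and b into one 1-D parameter vector (illustrative only).
    return np.concatenate([np.concatenate([layer['W'].ravel(), layer['b'].ravel()])
                           for layer in weights])

def pack_struct_sketch(theta, layer_units):
    # Rebuild the list of {'W', 'b'} dicts from the flat vector (illustrative only).
    weights = []
    offset = 0
    for n_in, n_out in zip(layer_units[:-1], layer_units[1:]):
        W = theta[offset:offset + n_in * n_out].reshape(n_in, n_out)
        offset += n_in * n_out
        b = theta[offset:offset + n_out]
        offset += n_out
        weights.append({'W': W, 'b': b})
    return weights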
If you are interested in all of the code used in this demonstration, please check the repository.