This post demonstrates how to implement a three-layer neural network for digit recognition.
The detailed derivation of the algorithm can be found in this script.
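For quick reference, the objective the code below actually minimizes is the mean squared error of the sigmoid output layer plus an L2 weight-decay term (the notation simply mirrors the variables in the listing at the end of the post, with the reg parameter written as λ):

$$
J(W, b) = \frac{1}{2m} \sum_{i=1}^{m} \left\| a^{(L)}_i - Y_i \right\|^2 + \frac{\lambda}{2} \sum_{l} \left\| W^{(l)} \right\|_F^2,
\qquad
a^{(l+1)} = \sigma\!\left(a^{(l)} W^{(l)} + b^{(l)}\right),
$$

where m is the number of training samples, Y is the binary (one-hot) class matrix, a^{(L)} is the output activation, and σ is the sigmoid.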
Main workflow
- Prepare the training, validation, and test datasets.
- Set the weight decay and numerical parameters.
- Check that the gradients of the loss function are correct.
- Train the model.
- Estimate the accuracy of the predictions.
IPython notebook
In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from dnn_play.classifiers.neural_net import NeuralNet, neural_net_loss, rel_err_gradients
from dnn_play.utils.data_utils import load_mnist
from dnn_play.utils.visualize_utils import display_network
# Plot settings
plt.rcParams['figure.figsize'] = (10.0, 10.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
In [2]:
# Load MNIST data
(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist()
#(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist(n_train=5500, n_val=500, n_test=1000)
print("X_train shape = {} y_train shape = {}".format(X_train.shape, y_train.shape))
print("X_val shape = {} y_val shape = {}".format(X_val.shape, y_val.shape))
print("X_test shape = {} y_test shape = {}".format(X_test.shape, y_test.shape))
In [3]:
# Number of layer units
input_size = X_train.shape[1] # Dimension of features
hidden_size = 28
n_classes = np.max(y_train) + 1
layer_units = (input_size, hidden_size, n_classes)
# Hyperparameters
reg = 1e-4  # Regularization strength (weight decay)
# Numerical parameters
max_iters = 400
# Initialize weights
clf = NeuralNet(layer_units)
weights = clf.init_weights()
loss, grad = neural_net_loss(weights, X_train, y_train, reg)
print('loss: %f' % loss)
In [4]:
# Gradient checking
if rel_err_gradients() < 1e-8:
    print("Gradient check passed!")
else:
    print("Gradient check failed!")
In [5]:
"""
Training
"""
weights, loss_history, train_acc_history, val_acc_history = clf.fit(X_train, y_train, X_val, y_val,
                                                                     reg=reg, max_iters=max_iters, verbose=True)
In [6]:
# Plot the loss function and train / validation accuracies
plt.subplot(2, 1, 1)
plt.plot(loss_history)
plt.title('Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
Out[6]:
In [7]:
# Visualize the weights
W0 = weights[0]['W']
image = display_network(W0)
plt.imshow(image, cmap = plt.cm.gray)
Out[7]:
In [8]:
# Make predictions
pred = clf.predict(X_test)
acc = np.mean(y_test == pred)
print("Accuracy: {:5.2f}% \n".format(acc*100))
In [9]:
# View some images and predictions
n_images = 3
images = X_test[:n_images].reshape((n_images, 28, 28))
pred = clf.predict(X_test[:n_images])
for i in range(n_images):
    plt.subplot(1, n_images, i+1)
    plt.imshow(images[i], cmap = plt.cm.gray)
    plt.title('Predicted digit: {}'.format(pred[i]))
    plt.axis('off')
NeuralNet classifier
import numpy as np
import scipy.optimize

from dnn_play.activations import sigmoid, sigmoid_deriv
from dnn_play.utils.np_utils import to_binary_class_matrix, flatten_struct, pack_struct
from dnn_play.utils.gradient_utils import eval_numerical_gradient, rel_norm_diff


def ac_func(x):
    return sigmoid(x)

def ac_func_deriv(x):
    return sigmoid_deriv(x)
class NeuralNet(object):
    """
    Classifier of the neural network.
    """

    def __init__(self, layer_units, weights=None):
        self.weights = weights
        self.layer_units = layer_units

    def init_weights(self, eps=1e-4):
        """
        Initialize weights.

        layer_units: tuple storing the size of each layer.
        weights:     structured weights (a list of {'W', 'b'} dicts, one per layer).
        """
        layer_units = self.layer_units
        n_layers = len(layer_units)

        weights = [{} for i in range(n_layers - 1)]
        for i in range(n_layers - 1):
            weights[i]['W'] = eps * np.random.randn(layer_units[i], layer_units[i+1])
            weights[i]['b'] = np.zeros(layer_units[i+1])

        self.weights = weights
        return self.weights
    def fit(self, X, y, X_val, y_val,
            reg=0.0,
            learning_rate=1e-2,
            optimizer='L-BFGS-B', max_iters=100,
            sample_batches=True,
            n_epochs=30, batch_size=128,
            verbose=False):

        epoch = 0
        best_val_acc = 0.0
        best_weights = {}

        if self.weights is None:
            # Lazily initialize weights
            self.weights = self.init_weights()

        # Solve with L-BFGS-B
        options = {'maxiter': max_iters, 'disp': verbose}

        def J(theta):
            weights = pack_struct(theta, self.layer_units)
            loss, grad = neural_net_loss(weights, X, y, reg)
            grad = flatten_struct(grad)
            return loss, grad

        # Callback to record the loss and the training / validation accuracies
        iter_feval = 0
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        def progress(x):
            nonlocal iter_feval, best_weights, best_val_acc
            iter_feval += 1

            # Loss history
            weights = pack_struct(x, self.layer_units)
            loss, grad = neural_net_loss(weights, X, y, reg)
            loss_history.append(loss)

            # Training accuracy
            y_pred_train = neural_net_predict(weights, X)
            train_acc = np.mean(y_pred_train == y)
            train_acc_history.append(train_acc)

            # Validation accuracy
            y_pred_val = neural_net_predict(weights, X_val)
            val_acc = np.mean(y_pred_val == y_val)
            val_acc_history.append(val_acc)

            # Keep track of the best weights based on validation accuracy
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                n_weights = len(weights)
                best_weights = [{} for i in range(n_weights)]
                for i in range(n_weights):
                    for p in weights[i]:
                        best_weights[i][p] = weights[i][p].copy()

            # Report progress roughly 20 times over the run
            n_iters_verbose = max(max_iters // 20, 1)
            if iter_feval % n_iters_verbose == 0:
                print("iter: {:4d}, loss: {:8f}, train_acc: {:4f}, val_acc: {:4f}".format(iter_feval, loss, train_acc, val_acc))

        # Minimize the loss function
        init_theta = flatten_struct(self.weights)
        results = scipy.optimize.minimize(J, init_theta, method=optimizer, jac=True, callback=progress, options=options)

        # Save the best weights
        self.weights = best_weights

        return self.weights, loss_history, train_acc_history, val_acc_history
    def predict(self, X):
        """
        X:    matrix of shape (n_samples, n_features); each row X[i] is one example.
        pred: the predicted class labels.
        """
        pred = neural_net_predict(self.weights, X)
        return pred

    def flatten_struct(self, data):
        return flatten_struct(data)

    def pack_struct(self, data):
        return pack_struct(data, self.layer_units)
def neural_net_loss(weights, X, y, reg):
    """
    Compute the loss and gradients of the neural network.
    """

    Y = to_binary_class_matrix(y)
    L = len(weights)  # The index of the output layer

    z = []
    a = []

    # Number of samples
    m = X.shape[0]

    # Forward pass
    z.append(0)   # Dummy element
    a.append(X)   # Input activation
    for i in range(0, L):
        W = weights[i]['W']
        b = weights[i]['b']
        z.append(np.dot(a[-1], W) + b)
        a.append(ac_func(z[-1]))

    # Loss function: mean squared error plus L2 weight decay
    sum_weight_square = 0.0  # Sum of squared weights
    for i in range(L):
        W = weights[i]['W']
        sum_weight_square += np.sum(W*W)
    loss = 1.0/(2.0*m) * np.sum((a[-1] - Y)**2) + 0.5*reg*sum_weight_square

    # Backpropagation
    delta = [(a[-1] - Y) * ac_func_deriv(z[-1])]
    for i in reversed(range(L)):  # Note that delta[0] will not be used
        W = weights[i]['W']
        d = np.dot(delta[0], W.T) * ac_func_deriv(z[i])
        delta.insert(0, d)  # Insert element at the beginning

    # Gradients
    grad = [{} for i in range(L)]
    for i in range(L):
        W = weights[i]['W']
        grad[i]['W'] = np.dot(a[i].T, delta[i+1]) / m + reg*W
        grad[i]['b'] = np.mean(delta[i+1], axis=0)

    return loss, grad
def neural_net_predict(weights, X):
    """
    X:    matrix of shape (n_samples, n_features); each row X[i] is one example.
    pred: the predicted class labels.
    """

    L = len(weights)  # The index of the output layer
    z = []
    a = []

    # Number of samples
    m = X.shape[0]

    # Forward pass
    z.append(0)   # Dummy element
    a.append(X)   # Input activation
    for i in range(0, L):
        W = weights[i]['W']
        b = weights[i]['b']
        z.append(np.dot(a[-1], W) + b)
        a.append(ac_func(z[-1]))

    # Predictions: the class with the highest output activation
    pred = np.argmax(a[-1], axis=1)

    return pred
def rel_err_gradients():
    """
    Return the relative error between the analytic and numerical gradients.
    """

    # Number of layer units
    n_samples = 100
    input_size = 4 * 4
    hidden_size = 4
    n_classes = 10
    layer_units = (input_size, hidden_size, n_classes)

    X_train = np.random.randn(n_samples, input_size)
    y_train = np.random.randint(n_classes, size=n_samples)
    reg = 1e-4

    # Define the classifier
    clf = NeuralNet(layer_units)

    # Initialize weights
    weights = clf.init_weights()

    # Analytic gradients of the cost function
    cost, grad = neural_net_loss(weights, X_train, y_train, reg)
    grad = clf.flatten_struct(grad)  # Flattened gradients

    def J(theta):
        # Structured weights
        weights = clf.pack_struct(theta)
        return neural_net_loss(weights, X_train, y_train, reg)[0]

    theta = clf.flatten_struct(weights)
    numerical_grad = eval_numerical_gradient(J, theta)

    # Compare the numerically computed gradients with those computed analytically
    rel_err = rel_norm_diff(numerical_grad, grad)

    return rel_err
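Since scipy.optimize.minimize works on a single flat parameter vector, the classifier round-trips between the structured list of {'W', 'b'} dicts and a 1-D array via flatten_struct / pack_struct from dnn_play.utils.np_utils. The sketch below is only an assumption of how such a round trip could look, not the library's actual implementation:

import numpy as np

def flatten_struct_sketch(weights):
    # Concatenate every W and b into one 1-D parameter vector (illustrative only).
    return np.concatenate([np.concatenate([layer['W'].ravel(), layer['b'].ravel()])
                           for layer in weights])

def pack_struct_sketch(theta, layer_units):
    # Rebuild the list of {'W', 'b'} dicts from the flat vector (illustrative only).
    weights = []
    offset = 0
    for n_in, n_out in zip(layer_units[:-1], layer_units[1:]):
        W = theta[offset:offset + n_in * n_out].reshape(n_in, n_out)
        offset += n_in * n_out
        b = theta[offset:offset + n_out]
        offset += n_out
        weights.append({'W': W, 'b': b})
    return weights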
If you are interested in all of the code used in this demonstration, please check the repository.