This post demonstrates how to implement a multilayer perceptron (MLP) for handwritten digit recognition.

The detailed derivation of the algorithm can be found in this script.

Main workflow

Prepare the training/validation/testing datasets.
Set the weight decay and the numerical parameters.
Check that the gradients of the loss function are correct.
Train the model.
Estimate the accuracy of the predictions.

IPython notebook

In [1]:

%load_ext autoreload
%autoreload 2
%matplotlib inline


import numpy as np
import matplotlib.pyplot as plt

from dnn_play.classifiers.mlp import MLP, mlp_loss, rel_err_gradients

from dnn_play.utils.data_utils import load_mnist
from dnn_play.utils.visualize_utils import display_network

# Plot settings: large default figure size and no smoothing between image pixels
plt.rcParams.update({
    'figure.figsize': (10.0, 10.0),
    'image.interpolation': 'nearest',
})

In [2]:

# Load the MNIST training / validation / test splits.
# For a quick experiment a smaller subset can be requested instead, e.g.
# load_mnist(n_train=900, n_val=100, n_test=100).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist()

# Report the array shapes of every split (features first, then labels).
print("X_train shape = {} y_train shape = {}".format(*(arr.shape for arr in (X_train, y_train))))
print("X_val   shape = {} y_val  shape = {}".format(*(arr.shape for arr in (X_val, y_val))))
print("X_test  shape = {} y_test shape = {}".format(*(arr.shape for arr in (X_test, y_test))))

X_train shape = (55000, 784) y_train shape = (55000,)
X_val   shape = (5000, 784) y_val  shape = (5000,)
X_test  shape = (10000, 784) y_test shape = (10000,)

In [3]:

# Layer sizes
input_size = X_train.shape[1]      # one input unit per feature
hidden_size_L1 = 200
output_size = np.max(y_train) + 1  # number of classes; assumes labels are 0-based integers

# Network configuration: (input, hidden, output)
layer_units = (input_size, hidden_size_L1, output_size)

# Hyperparameters
reg = 1e-4  # regularization strength (weight decay)

# Numerical parameters
max_iters = 400

# Build the classifier and draw its initial weights
clf = MLP(layer_units)
weights = clf.init_weights()

# Loss at initialization, evaluated without regularization (reg = 0.0)
loss, grad = mlp_loss(weights, X_train, y_train, 0.0)

# Sanity check: there are 10 classes and the initial weights are tiny, so the
# predicted distribution is nearly uniform and the loss should be close to
# -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))

loss: 2.302589
sanity check: 2.302585

In [4]:

# Gradient checking: the analytic gradients must agree with the numerical
# ones to within a relative error of 1e-8.
print("Gradient check passed!" if rel_err_gradients() < 1e-8 else "Gradient check failed!")

Gradient check passed!

In [5]:

# Training
reg = 1e-4  # regularization strength (weight decay)

# To start from an existing set of weights, the classifier can be rebuilt with
# clf = MLP(layer_units, weights=weights)
(weights,
 loss_history,
 train_acc_history,
 val_acc_history) = clf.fit(
    X_train, y_train, X_val, y_val,
    reg=reg, max_iters=max_iters, verbose=True,
)

iter:   20, loss: 1.586643, train_acc: 0.431182, val_acc: 0.446200
iter:   40, loss: 0.361330, train_acc: 0.902291, val_acc: 0.924200
iter:   60, loss: 0.226814, train_acc: 0.945418, val_acc: 0.957600
iter:   80, loss: 0.169513, train_acc: 0.966964, val_acc: 0.972000
iter:  100, loss: 0.138802, train_acc: 0.980491, val_acc: 0.976800
iter:  120, loss: 0.121456, train_acc: 0.986982, val_acc: 0.979400
iter:  140, loss: 0.112092, train_acc: 0.991691, val_acc: 0.980600
iter:  160, loss: 0.107356, train_acc: 0.993564, val_acc: 0.981600
iter:  180, loss: 0.104478, train_acc: 0.994745, val_acc: 0.983200
iter:  200, loss: 0.102320, train_acc: 0.995327, val_acc: 0.984200
iter:  220, loss: 0.100476, train_acc: 0.995291, val_acc: 0.983800
iter:  240, loss: 0.099097, train_acc: 0.995145, val_acc: 0.985000
iter:  260, loss: 0.097932, train_acc: 0.995582, val_acc: 0.983800
iter:  280, loss: 0.097004, train_acc: 0.995782, val_acc: 0.984200
iter:  300, loss: 0.096228, train_acc: 0.996291, val_acc: 0.984600
iter:  320, loss: 0.095559, train_acc: 0.996364, val_acc: 0.985200
iter:  340, loss: 0.094926, train_acc: 0.996327, val_acc: 0.984400
iter:  360, loss: 0.094436, train_acc: 0.996382, val_acc: 0.985000
iter:  380, loss: 0.094027, train_acc: 0.996436, val_acc: 0.984800
iter:  400, loss: 0.093674, train_acc: 0.996491, val_acc: 0.985400

In [6]:

# Plot the loss and the training / validation accuracies.
# Each history holds one entry per optimizer iteration (it is appended to by
# the callback inside MLP.fit), so the x axis counts iterations, not epochs.
plt.subplot(2, 1, 1)
plt.plot(loss_history)
plt.title('Loss history')
plt.xlabel('Iteration')
plt.ylabel('Loss')

plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
plt.xlabel('Iteration')
plt.ylabel('Classification accuracy')

Out[6]:

<matplotlib.text.Text at 0x106b5f630>

In [7]:

# Visualize the first-layer weights (the matrix connecting the inputs to the
# hidden layer), rendered as a single grayscale image.
W0 = weights[0]['W']
image = display_network(W0)
plt.imshow(image, cmap='gray')

Out[7]:

<matplotlib.image.AxesImage at 0x106be5fd0>

In [8]:

# Predict the test set and report the fraction of correctly classified digits
pred = clf.predict(X_test)
acc = np.mean(pred == y_test)

print("Accuracy: {:5.2f}% \n".format(100 * acc))

Accuracy: 98.14%

In [9]:

# Show the first few test digits side by side, each titled with its prediction
n_images = 4
images = X_test[:n_images].reshape(n_images, 28, 28)  # flat rows -> 28 x 28 pixel images
pred = clf.predict(X_test[:n_images])

for i in range(n_images):
    plt.subplot(1, n_images, i + 1)
    plt.imshow(images[i], cmap='gray')
    plt.title('Predicted digit: {}'.format(pred[i]))
    plt.axis('off')

Multilayer perceptron

"""Multilayer perceptron with a softmax output layer, trained by batch optimization."""

import numpy as np
import scipy.optimize

from dnn_play.activations import tanh, tanh_deriv, sigmoid, sigmoid_deriv
from dnn_play.utils.np_utils import to_binary_class_matrix, flatten_struct, pack_struct
from dnn_play.utils.gradient_utils import eval_numerical_gradient, rel_norm_diff

# Small constant added to the softmax output so that log(proba) stays finite.
_ERR_TOL = 1e-10


def ac_func(x):
    """Hidden-layer activation function (delegates to `sigmoid`)."""
    return sigmoid(x)


def ac_func_deriv(x):
    """Derivative of the hidden-layer activation (delegates to `sigmoid_deriv`).

    NOTE(review): `mlp_loss` calls this with the pre-activations z; confirm
    that `sigmoid_deriv` expects z rather than sigmoid(z).
    """
    return sigmoid_deriv(x)


def _copy_weights(weights):
    """Return a deep copy of structured weights (a list of per-layer dicts of arrays)."""
    return [{name: layer[name].copy() for name in layer} for layer in weights]


def _softmax(scores):
    """Row-wise softmax of `scores`, offset by `_ERR_TOL`."""
    # Subtract the row maximum to avoid overflow in exp() for large scores.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True) + _ERR_TOL


def _forward(weights, X):
    """Run the forward pass shared by `mlp_loss` and `mlp_predict`.

    Returns (z, a, proba):
        z[i]  pre-activation of layer i for i = 1..L (z[0] is a dummy entry),
        a[i]  input fed to weights[i] for i = 0..L-1 (a[0] is X),
        proba softmax of the output-layer pre-activation z[L].
    """
    z = [0]  # Dummy element so that z is indexed by layer number
    a = [X]  # Input activation
    last = len(weights) - 1
    for i, layer in enumerate(weights):
        z.append(np.dot(a[-1], layer['W']) + layer['b'])
        if i < last:
            # The output layer feeds the softmax directly, so its activation
            # is never needed.
            a.append(ac_func(z[-1]))
    return z, a, _softmax(z[-1])


class MLP(object):
    """
    Multilayer perceptron.

    layer_units: sequence with the size of each layer, input layer first.
    weights:     optional structured weights (list with one {'W', 'b'} dict
                 per pair of consecutive layers). When None, `fit`
                 initializes them lazily.
    """

    def __init__(self, layer_units, weights=None):
        self.weights = weights
        self.layer_units = layer_units

    def init_weights(self, eps=1e-4):
        """
        Initialize the weights and store them on the instance.

        eps: scale applied to the standard-normal samples used for 'W'.

        Returns the structured weights: for layer i, 'W' has shape
        (layer_units[i], layer_units[i+1]) and 'b' is a zero vector of
        length layer_units[i+1].
        """
        layer_units = self.layer_units
        weights = []
        for n_in, n_out in zip(layer_units[:-1], layer_units[1:]):
            weights.append({
                'W': eps * np.random.randn(n_in, n_out),
                'b': np.zeros(n_out),
            })

        self.weights = weights
        return self.weights

    def fit(self, X, y, X_val, y_val,
            reg=0.0,
            learning_rate=1e-2,
            optimizer='L-BFGS-B',
            max_iters=100,
            verbose=False):
        """
        Train the network by minimizing `mlp_loss` with scipy.optimize.minimize.

        X, y:          training samples (one row per sample) and their labels.
        X_val, y_val:  validation samples and labels, used to select the weights.
        reg:           regularization strength passed to `mlp_loss`.
        learning_rate: currently unused; kept for interface compatibility.
        optimizer:     passed as `method` to scipy.optimize.minimize.
        max_iters:     passed as the 'maxiter' option.
        verbose:       passed as the 'disp' option; also enables the periodic
                       progress line printed by the callback.

        Returns (weights, loss_history, train_acc_history, val_acc_history).
        The histories hold one entry per callback invocation. `weights` (also
        stored in self.weights) are the ones with the highest validation
        accuracy seen by the callback; if the callback never records an
        improvement, the optimizer's final weights are used instead.
        """
        if self.weights is None:
            # lazily initialize weights
            self.weights = self.init_weights()

        options = {'maxiter': max_iters, 'disp': verbose}

        def J(theta):
            """Loss and flattened gradient at the flat parameter vector theta."""
            weights = pack_struct(theta, self.layer_units)
            loss, grad = mlp_loss(weights, X, y, reg)
            return loss, flatten_struct(grad)

        # State updated by the callback
        iter_feval = 0
        best_val_acc = 0.0
        best_weights = None
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        # Report roughly 20 times over the run; never less often than every
        # iteration (a zero interval would break the modulo below).
        n_iters_verbose = max(1, max_iters // 20)

        def progress(x):
            """Record loss / accuracies for the current iterate and track the best weights."""
            nonlocal iter_feval, best_weights, best_val_acc
            iter_feval += 1

            # Loss history
            weights = pack_struct(x, self.layer_units)
            loss, _ = mlp_loss(weights, X, y, reg)
            loss_history.append(loss)

            # Training accuracy
            train_acc = np.mean(mlp_predict(weights, X) == y)
            train_acc_history.append(train_acc)

            # Validation accuracy
            val_acc = np.mean(mlp_predict(weights, X_val) == y_val)
            val_acc_history.append(val_acc)

            # Keep track of the best weights based on validation accuracy.
            # Copy them: the arrays come from the optimizer's current iterate.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_weights = _copy_weights(weights)

            if verbose and iter_feval % n_iters_verbose == 0:
                print("iter: {:4d}, loss: {:8f}, train_acc: {:4f}, val_acc: {:4f}".format(iter_feval, loss, train_acc, val_acc))

        # Minimize the loss function
        init_theta = flatten_struct(self.weights)
        results = scipy.optimize.minimize(J, init_theta, method=optimizer, jac=True,
                                          callback=progress, options=options)

        if best_weights is None:
            # The callback never fired or validation accuracy never rose above
            # zero: fall back to the optimizer's result instead of discarding
            # the model.
            best_weights = _copy_weights(pack_struct(results.x, self.layer_units))

        # Save weights
        self.weights = best_weights

        return self.weights, loss_history, train_acc_history, val_acc_history

    def predict(self, X):
        """
        X: input matrix with one sample per row and one column per input unit.

        Returns the predicted class index for every row of X.
        """
        return mlp_predict(self.weights, X)

    def flatten_struct(self, data):
        """Flatten structured weights / gradients (delegates to `flatten_struct`)."""
        return flatten_struct(data)

    def pack_struct(self, data):
        """Rebuild structured weights from flat data using this network's layer sizes."""
        return pack_struct(data, self.layer_units)


def mlp_loss(weights, X, y, reg):
    """
    Compute the loss and the gradients of the neural network.

    weights: structured weights (list with one {'W', 'b'} dict per layer).
    X:       input matrix with one sample per row.
    y:       class labels, converted to a target matrix by
             `to_binary_class_matrix` (presumably one-hot; it must have the
             same shape as the softmax output).
    reg:     regularization strength applied to the squared 'W' entries.

    Returns (loss, grad), where grad has the same structure as `weights`.
    """
    L = len(weights)  # The index of the output layer

    # Number of samples
    m = X.shape[0]

    # Forward pass
    z, a, proba = _forward(weights, X)

    # Target matrix of labels
    Y = to_binary_class_matrix(y)

    # Loss function: mean cross-entropy plus the weight-decay penalty
    sum_squared_weights = sum((np.sum(layer['W'] * layer['W']) for layer in weights), 0.0)
    loss = -1.0/m * np.sum(Y * np.log(proba)) + 0.5*reg*sum_squared_weights

    # Backpropagation. delta[i] is the error at layer i's pre-activation;
    # delta[0] (the input layer) is never needed and is left as None.
    delta = [None] * (L + 1)
    delta[L] = proba - Y
    for i in range(L - 1, 0, -1):
        delta[i] = np.dot(delta[i+1], weights[i]['W'].T) * ac_func_deriv(z[i])

    # Gradients
    grad = []
    for i, layer in enumerate(weights):
        W = layer['W']
        grad.append({
            'W': np.dot(a[i].T, delta[i+1]) / m + reg*W,
            'b': np.mean(delta[i+1], axis=0),
        })

    return loss, grad


def mlp_predict(weights, X):
    """
    weights: structured weights (list with one {'W', 'b'} dict per layer).
    X:       input matrix with one sample per row.

    Returns the index of the most probable class for every row of X.
    """
    _, _, proba = _forward(weights, X)

    # Predictions
    return np.argmax(proba, axis=1)


def rel_err_gradients():
    """
    Return the relative error between analytic gradients and numerical ones.

    The check runs on a small random problem (100 samples, 16 inputs, two
    hidden layers of 4 units, 10 classes) with freshly initialized weights.
    """
    # Number of layer units
    n_samples = 100
    input_size = 4 * 4
    hidden_size_L1 = 4
    hidden_size_L2 = 4
    output_size = 10
    layer_units = (input_size, hidden_size_L1, hidden_size_L2, output_size)

    X_train = np.random.randn(n_samples, input_size)
    y_train = np.random.randint(output_size, size=n_samples)
    reg = 1e-4

    # Define the classifier
    clf = MLP(layer_units)

    # Initialize weights
    weights = clf.init_weights()

    # Analytic gradients of the cost function
    _, grad = mlp_loss(weights, X_train, y_train, reg)
    grad = clf.flatten_struct(grad)  # Flattened gradients

    def J(theta):
        """Loss as a function of the flat parameter vector."""
        # Structured weights
        weights = clf.pack_struct(theta)
        return mlp_loss(weights, X_train, y_train, reg)[0]

    theta = clf.flatten_struct(weights)
    numerical_grad = eval_numerical_gradient(J, theta)

    # Compare numerically computed gradients with those computed analytically
    return rel_norm_diff(numerical_grad, grad)

If you are interested in all the code related to this demonstration, please check the repository.

Comments

Implement multilayer perceptron for digit recognition

Main workflow

IPython notebook

Multilayer perceptron

Published

Category

Tags