
Implement softmax regression for digit recognition

This post demonstrates how to implement softmax regression for digit recognition on MNIST.

The detailed derivations of the algorithm can be found in this script.
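As a quick refresher before the notebook: softmax regression computes a linear score for each class and turns the scores into probabilities with the softmax function. Here is a minimal sketch in NumPy (the function names are illustrative, not the dnn_play API):

import numpy as np

def softmax(z):
    # Subtract the row-wise max for numerical stability before exponentiating.
    z = z - np.max(z, axis=-1, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=-1, keepdims=True)

def predict_proba(X, W, b):
    # Linear scores followed by softmax: one probability per class, rows sum to 1.
    return softmax(X @ W + b)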

Main workflow

  • Prepare the training/validation/test datasets.
  • Set the weight decay and numerical parameters.
  • Check that the gradients of the loss function are correct.
  • Train the model.
  • Estimate the accuracy of the predictions.

IPython notebook

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from dnn_play.classifiers.softmax import Softmax, softmax_loss, rel_err_gradients
from dnn_play.utils.data_utils import load_mnist
from dnn_play.utils.visualize_utils import display_network


# Plot settings
plt.rcParams['figure.figsize'] = (10.0, 10.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
In [2]:
# Load MNIST data
(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist()
#(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist(n_train=9000, n_val=1000, n_test=1000)

print("X_train shape = {} y_train shape = {}".format(X_train.shape, y_train.shape))
print("X_val   shape = {} y_val  shape = {}".format(X_val.shape, y_val.shape))
print("X_test  shape = {} y_test shape = {}".format(X_test.shape, y_test.shape))
X_train shape = (55000, 784) y_train shape = (55000,)
X_val   shape = (5000, 784) y_val  shape = (5000,)
X_test  shape = (10000, 784) y_test shape = (10000,)
In [3]:
# Sanity check of softmax loss function

# Number of layer units
input_size  = X_train.shape[1] # Dimension of features
n_classes = np.max(y_train) + 1

layer_units = (input_size, n_classes)

# Hyperparameters
reg = 1e-4

# Numerical parameters
max_iters = 400

# Define classifier
clf = Softmax(layer_units)
weights = clf.init_weights()

loss, grad = softmax_loss(weights, X_train, y_train, 0.0)

# Note there are 10 classes.
# As a rough sanity check, our loss should be something close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))
loss: 2.302549
sanity check: 2.302585
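Why -log(0.1)? With near-zero initial weights the class scores are roughly equal, so each of the 10 classes gets probability about 0.1, and the cross-entropy loss per example is -log(0.1) ≈ 2.3026. A quick illustration (assuming uniform predicted probabilities):

p = np.full(10, 0.1)      # near-uniform predicted probabilities over 10 classes
print(-np.log(p[0]))      # 2.302585..., matching the sanity check above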
In [4]:
# Gradient checking
if rel_err_gradients() < 1e-8:
    print("Gradient check passed!")
else:
    print("Gradient check failed!") 
Gradient check passed!
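rel_err_gradients compares the analytic gradient from softmax_loss against a numerical estimate. A common way to do this (a sketch of the standard technique; the dnn_play internals may differ) is a centered finite difference plus a relative-error measure:

def numerical_gradient(f, w, eps=1e-5):
    # Centered finite differences: (f(w + eps) - f(w - eps)) / (2 * eps), entry by entry.
    grad = np.zeros_like(w)
    it = np.nditer(w, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = w[idx]
        w[idx] = old + eps
        f_plus = f(w)
        w[idx] = old - eps
        f_minus = f(w)
        w[idx] = old  # restore the original entry
        grad[idx] = (f_plus - f_minus) / (2 * eps)
        it.iternext()
    return grad

def rel_error(a, b):
    # Maximum relative error; values around 1e-8 or smaller indicate a correct gradient.
    return np.max(np.abs(a - b) / np.maximum(1e-8, np.abs(a) + np.abs(b)))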
In [5]:
"""
Training
"""

model, loss_history, train_acc_history, val_acc_history = clf.fit(X_train, y_train, X_val, y_val, 
                                                                  reg=reg, max_iters=max_iters, verbose=True)   
iter:   20, loss: 0.325872, train_acc: 0.908945, val_acc: 0.928800
iter:   40, loss: 0.285137, train_acc: 0.922873, val_acc: 0.937600
iter:   60, loss: 0.272479, train_acc: 0.927382, val_acc: 0.939600
iter:   80, loss: 0.267087, train_acc: 0.930382, val_acc: 0.942600
iter:  100, loss: 0.264715, train_acc: 0.931636, val_acc: 0.940800
iter:  120, loss: 0.263642, train_acc: 0.932600, val_acc: 0.941200
iter:  140, loss: 0.263141, train_acc: 0.932745, val_acc: 0.941200
iter:  160, loss: 0.262935, train_acc: 0.933073, val_acc: 0.942000
iter:  180, loss: 0.262832, train_acc: 0.932982, val_acc: 0.942400
iter:  200, loss: 0.262783, train_acc: 0.933109, val_acc: 0.942400
iter:  220, loss: 0.262759, train_acc: 0.933109, val_acc: 0.942600
iter:  240, loss: 0.262749, train_acc: 0.932927, val_acc: 0.942600
iter:  260, loss: 0.262744, train_acc: 0.932891, val_acc: 0.942400
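Note that the log stops at iteration 260 even though max_iters is 400, which is consistent with fit stopping once the loss plateaus. In essence, fit runs gradient descent on the regularized softmax loss; a minimal sketch of such a loop, treating the parameters as one flat array (the learning rate, tolerance, and structure are illustrative assumptions, not clf.fit's actual internals):

def fit_sketch(weights, X, y, reg, max_iters, lr=0.5, tol=1e-7):
    prev_loss = np.inf
    for i in range(max_iters):
        loss, grad = softmax_loss(weights, X, y, reg)  # loss and gradient in one pass
        weights -= lr * grad                           # gradient descent step
        if abs(prev_loss - loss) < tol:                # stop once the loss plateaus
            break
        prev_loss = loss
    return weights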
In [6]:
# Plot the loss function and train / validation accuracies
plt.subplot(2, 1, 1)
plt.plot(loss_history)
plt.title('Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')

plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
Out[6]:
<matplotlib.text.Text at 0x10e838518>
In [7]:
# Visualize the weights 

W0 = model[0]['W']
image = display_network(W0)
plt.imshow(image, cmap = plt.cm.gray)
Out[7]:
<matplotlib.image.AxesImage at 0x10e88e390>
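Each column of W0 has one weight per input pixel, so it can be reshaped into a 28x28 image: the learned template for that digit. display_network tiles these templates into a single image; a hand-rolled equivalent (assuming W0 has shape (784, 10), one column per class) would be:

for c in range(10):
    plt.subplot(1, 10, c + 1)
    plt.imshow(W0[:, c].reshape(28, 28), cmap=plt.cm.gray)  # template for digit c
    plt.title(str(c))
    plt.axis('off')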
In [8]:
# Make predictions

pred = clf.predict(X_test)

acc = np.mean(y_test == pred)
print("Accuracy: {:5.2f}% \n".format(acc*100))
Accuracy: 92.46% 
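Prediction itself is just the argmax of the linear class scores; since softmax is monotonic, applying it before the argmax would give the same labels. A sketch of what predict computes (parameter names are illustrative):

def predict_sketch(X, W, b):
    # Most likely class per row: index of the highest score.
    return np.argmax(X @ W + b, axis=1)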

In [9]:
# View some images and predictions
n_images = 3
images = X_test[:n_images].reshape((n_images, 28, 28))
pred = clf.predict(X_test[:n_images])

for i in range(n_images):
    plt.subplot(1, n_images, i+1)
    plt.imshow(images[i], cmap = plt.cm.gray)
    plt.title('Predicted digit: {}'.format(pred[i]))
    plt.axis('off')

Softmax classifier

If you are interested in all the code used in this demonstration, please check the repository.


Published: Dec 23, 2015
Category: Machine learning
