This post demonstrates how to implement a stacked multilayer perceptron for digit recognition.

Here, we consider a multilayer perceptron with four layers and use sparse autoencoders to determine the initial values of the weights connecting the first three layers.

Main workflow

Prepare the training/validation/testing datasets.
Set the hyperparameters and numerical parameters.
Determine the initial values of the weights for each layer.
Fine-tune the model.
Estimate the accuracy of the predictions.

IPython notebook

In [1]:

%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from dnn_play.classifiers.sparse_autoencoder import SparseAutoencoder
from dnn_play.classifiers.softmax import Softmax
from dnn_play.classifiers.mlp import MLP

from dnn_play.utils.data_utils import load_mnist
from dnn_play.utils.visualize_utils import display_network

# Plot settings: default figure size and nearest-neighbour image interpolation
plt.rcParams.update({
    'figure.figsize': (10.0, 10.0),
    'image.interpolation': 'nearest',
})

In [2]:

# Load the MNIST training / validation / test splits.
# A smaller subset can be requested instead, e.g.
# load_mnist(n_train=5500, n_val=500, n_test=1000)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = load_mnist()

# Report the shape of every split; each template keeps its own column padding.
shape_reports = (
    ("X_train shape = {} y_train shape = {}", X_train, y_train),
    ("X_val   shape = {} y_val  shape = {}", X_val, y_val),
    ("X_test  shape = {} y_test shape = {}", X_test, y_test),
)
for template, features, labels in shape_reports:
    print(template.format(features.shape, labels.shape))

X_train shape = (55000, 784) y_train shape = (55000,)
X_val   shape = (5000, 784) y_val  shape = (5000,)
X_test  shape = (10000, 784) y_test shape = (10000,)

In [3]:

# Network configuration: input layer, two hidden layers and an output layer
input_size  = X_train.shape[1] # Dimension of features
hidden_size_L1 = 200
hidden_size_L2 = 200
output_size = np.max(y_train) + 1 # Number of classes, assuming labels are the integers 0..K-1

layer_units = (input_size, hidden_size_L1, hidden_size_L2, output_size)
n_layers = len(layer_units)

# Hyperparameters
reg = 2e-4 # Regularization (weight decay) strength; alternative value: 1e-4
beta = 3               # weight of sparsity penalty term
sparsity_param = 1e-1  # desired average activation of the hidden units

# Numerical parameters
max_iters = 400  # passed as max_iters to every fit() call in this notebook

In [4]:

# Train the first sparse autoencoder on the raw inputs.
# Its layout is input -> hidden layer 1 -> reconstruction of the input.

sae1_layer_units = (input_size, hidden_size_L1, input_size)
sae1 = SparseAutoencoder(sae1_layer_units)
sae1_weights, sae1_loss_history = sae1.fit(X_train, reg=reg, beta=beta, sparsity_param=sparsity_param, 
                                           max_iters=max_iters, verbose=True)

# Hidden-layer activations of the trained autoencoder for the training and
# validation sets; they are the inputs of the second autoencoder.
train_features_L1 = sae1.forward_pass(X_train)
val_features_L1 = sae1.forward_pass(X_val)

iter:   20, loss: 24.512965
iter:   40, loss: 17.893108
iter:   60, loss: 12.790935
iter:   80, loss: 10.147536
iter:  100, loss: 8.579675
iter:  120, loss: 7.455113
iter:  140, loss: 6.706541
iter:  160, loss: 6.187018
iter:  180, loss: 5.784339
iter:  200, loss: 5.512282
iter:  220, loss: 5.315639
iter:  240, loss: 5.148164
iter:  260, loss: 5.012013
iter:  280, loss: 4.919683
iter:  300, loss: 4.830509
iter:  320, loss: 4.758168
iter:  340, loss: 4.688783
iter:  360, loss: 4.628258
iter:  380, loss: 4.567358
iter:  400, loss: 4.510771

In [5]:

# Plot the loss history of the first sparse autoencoder
# (one value per optimizer callback; no accuracies are tracked for autoencoders)
plt.subplot(2, 1, 1)
plt.plot(sae1_loss_history)
plt.title('SAE1 Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')

Out[5]:

<matplotlib.text.Text at 0x112523b00>

In [6]:

# Visualize the input-to-hidden weights learned by the first autoencoder.
# NOTE(review): display_network presumably tiles each hidden unit's weights
# into one image -- confirm against dnn_play.utils.visualize_utils.

sae1_W0 = sae1_weights[0]['W']
image = display_network(sae1_W0)
plt.imshow(image, cmap = plt.cm.gray)

Out[6]:

<matplotlib.image.AxesImage at 0x10bf80c88>

In [7]:

# Train the second sparse autoencoder on the layer-1 features.
# Its layout is hidden layer 1 -> hidden layer 2 -> reconstruction of hidden layer 1.

sae2_layer_units = (hidden_size_L1, hidden_size_L2, hidden_size_L1)
sae2 = SparseAutoencoder(sae2_layer_units)
sae2_weights, sae2_loss_history = sae2.fit(train_features_L1, reg=reg, beta=beta, sparsity_param=sparsity_param, 
                                           max_iters=max_iters, verbose=True)

# Hidden-layer activations of the second autoencoder for the training and
# validation sets; they are the inputs of the softmax classifier.
train_features_L2 = sae2.forward_pass(train_features_L1)
val_features_L2 = sae2.forward_pass(val_features_L1)

iter:   20, loss: 3.870356
iter:   40, loss: 2.557963
iter:   60, loss: 2.024741
iter:   80, loss: 1.699575
iter:  100, loss: 1.518455
iter:  120, loss: 1.412255
iter:  140, loss: 1.350893
iter:  160, loss: 1.311039
iter:  180, loss: 1.277338
iter:  200, loss: 1.251721
iter:  220, loss: 1.229505
iter:  240, loss: 1.210008
iter:  260, loss: 1.193672
iter:  280, loss: 1.180739
iter:  300, loss: 1.168971
iter:  320, loss: 1.157312
iter:  340, loss: 1.147839
iter:  360, loss: 1.137430
iter:  380, loss: 1.130605
iter:  400, loss: 1.123317

In [8]:

# Plot the loss history of the second sparse autoencoder
# (one value per optimizer callback; no accuracies are tracked for autoencoders)
plt.subplot(2, 1, 1)
plt.plot(sae2_loss_history)
plt.title('SAE2 Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')

Out[8]:

<matplotlib.text.Text at 0x10bf3f048>

In [9]:

# Train the softmax classifier on the layer-2 features; the layer-2 validation
# features and labels are passed along so fit() can also report validation accuracy.

sm_layer_units = (hidden_size_L2, output_size)
sm_clf = Softmax(sm_layer_units)
sm_weights, sm_loss_history, sm_train_acc_history, sm_val_acc_history = sm_clf.fit(train_features_L2, y_train, 
    val_features_L2, y_val, reg=reg, max_iters=max_iters, verbose=True)

iter:   20, loss: 0.278223, train_acc: 0.946945, val_acc: 0.961600
iter:   40, loss: 0.271527, train_acc: 0.950945, val_acc: 0.965400
iter:   60, loss: 0.271289, train_acc: 0.951182, val_acc: 0.965200
iter:   80, loss: 0.271261, train_acc: 0.951018, val_acc: 0.965600

In [10]:

# Top panel: loss history of the softmax classifier
plt.subplot(2, 1, 1)
plt.plot(sm_loss_history)
plt.title('Softmax Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Bottom panel: training and validation accuracy histories
plt.subplot(2, 1, 2)
plt.plot(sm_train_acc_history)
plt.plot(sm_val_acc_history)
plt.legend(['Softmax Training accuracy', 'Softmax Validation accuracy'], loc='lower right')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')  # label was misspelled 'Clasification'

Out[10]:

<matplotlib.text.Text at 0x10ee4c358>

In [11]:

# Stack the pretrained pieces into the initial weights of the full network:
# encoder of SAE1, encoder of SAE2, then the softmax layer.
init_weights = [sae1_weights[0], sae2_weights[0], sm_weights[0]]

# Multilayer perceptron classifier starting from the pretrained weights
clf = MLP(layer_units, weights=init_weights)

# Fine-tune all layers together on the labelled training data
opt_weights, loss_history, train_acc_history, val_acc_history = clf.fit(
    X_train, y_train, X_val, y_val,
    reg=reg, max_iters=max_iters, verbose=True,
)

iter:   20, loss: 0.454546, train_acc: 0.966145, val_acc: 0.974200
iter:   40, loss: 0.297224, train_acc: 0.978145, val_acc: 0.978200
iter:   60, loss: 0.219680, train_acc: 0.983618, val_acc: 0.979600
iter:   80, loss: 0.191619, train_acc: 0.987036, val_acc: 0.981600
iter:  100, loss: 0.176366, train_acc: 0.989764, val_acc: 0.982400
iter:  120, loss: 0.167822, train_acc: 0.991400, val_acc: 0.984600
iter:  140, loss: 0.162212, train_acc: 0.992382, val_acc: 0.984200
iter:  160, loss: 0.159403, train_acc: 0.992673, val_acc: 0.985400
iter:  180, loss: 0.157167, train_acc: 0.993055, val_acc: 0.985000
iter:  200, loss: 0.155405, train_acc: 0.993236, val_acc: 0.985200
iter:  220, loss: 0.153924, train_acc: 0.993418, val_acc: 0.984800
iter:  240, loss: 0.152531, train_acc: 0.993564, val_acc: 0.984400
iter:  260, loss: 0.151141, train_acc: 0.993818, val_acc: 0.985200
iter:  280, loss: 0.149839, train_acc: 0.994182, val_acc: 0.984600
iter:  300, loss: 0.148864, train_acc: 0.993964, val_acc: 0.983400
iter:  320, loss: 0.148131, train_acc: 0.994145, val_acc: 0.983800
iter:  340, loss: 0.147285, train_acc: 0.994164, val_acc: 0.984000
iter:  360, loss: 0.146488, train_acc: 0.993891, val_acc: 0.984000
iter:  380, loss: 0.145849, train_acc: 0.994073, val_acc: 0.983400
iter:  400, loss: 0.145317, train_acc: 0.994309, val_acc: 0.984800

In [12]:

# Top panel: loss history of the fine-tuned multilayer perceptron
plt.subplot(2, 1, 1)
plt.plot(loss_history)
plt.title('Loss history')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Bottom panel: training and validation accuracy histories
plt.subplot(2, 1, 2)
plt.plot(train_acc_history)
plt.plot(val_acc_history)
plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')  # label was misspelled 'Clasification'

Out[12]:

<matplotlib.text.Text at 0x1107ebe10>

In [13]:

# Test-set accuracy of the fine-tuned network

pred = clf.predict(X_test)
acc = (pred == y_test).mean()

print("Accuracy with fine-tune: {:5.2f}% \n".format(100 * acc))

Accuracy with fine-tune: 98.15%

In [14]:

# Test-set accuracy of the stacked pretrained weights, before any fine-tuning

clf_without_finetune = MLP(layer_units, weights=init_weights)
pred = clf_without_finetune.predict(X_test)
acc = (pred == y_test).mean()

print("Accuracy without fine-tune: {:5.2f}% \n".format(100 * acc))

Accuracy without fine-tune: 95.53%

In [15]:

# Show the first few test digits with the fine-tuned model's predictions
n_images = 4
sample = X_test[:n_images]
images = sample.reshape((n_images, 28, 28))
pred = clf.predict(sample)

for position, (digit_image, digit) in enumerate(zip(images, pred), start=1):
    plt.subplot(1, n_images, position)
    plt.imshow(digit_image, cmap=plt.cm.gray)
    plt.title('Predicted digit: {}'.format(digit))
    plt.axis('off')

In [16]:

# Visualize the first-layer weights W0 returned by the fine-tuning fit

image = display_network(opt_weights[0]['W'])
plt.imshow(image, cmap = plt.cm.gray)

Out[16]:

<matplotlib.image.AxesImage at 0x1108ca4e0>

Sparse autoencoder

import numpy as np
import scipy.optimize

from dnn_play.activations import sigmoid, sigmoid_deriv
from dnn_play.utils.np_utils import to_binary_class_matrix, flatten_struct, pack_struct
from dnn_play.utils.gradient_utils import eval_numerical_gradient, rel_norm_diff


def ac_func(x):
    """Activation function applied by both layers of the autoencoder (sigmoid)."""
    return sigmoid(x)


def ac_func_deriv(x):
    """Derivative of ``ac_func``; callers pass the pre-activations ``z``."""
    return sigmoid_deriv(x)


def _copy_weights(weights):
    """Return a copy of structured weights (a list of ``{'W': ..., 'b': ...}`` dicts)."""
    return [{name: value.copy() for name, value in layer.items()}
            for layer in weights]


class SparseAutoencoder(object):
    """
    Sparse autoencoder with a single hidden layer.

    layer_units: tuple (input size, hidden size, output size).
    weights:     optional structured weights, a list with one
                 {'W': ..., 'b': ...} dict per layer; initialized lazily
                 by fit() when omitted.
    """

    def __init__(self, layer_units, weights=None):
        self.weights = weights
        self.layer_units = layer_units

    def init_weights(self):
        """
        Initialize the weights randomly and store them on the instance.

        W is drawn uniformly from [-r, r) with
        r = sqrt(6) / sqrt(n_input + n_hidden); the biases start at zero.

        Returns the structured weights.
        """
        # Note layer_units[2] = layer_units[0]
        layer_units = self.layer_units
        n_layers = len(layer_units)
        assert n_layers == 3, "a sparse autoencoder needs exactly three layers"

        # Initialize parameters randomly based on layer sizes
        r = np.sqrt(6) / np.sqrt(layer_units[1] + layer_units[0])

        weights = [{} for _ in range(n_layers - 1)]
        weights[0]['W'] = np.random.random((layer_units[0], layer_units[1])) * 2.0 * r - r
        weights[1]['W'] = np.random.random((layer_units[1], layer_units[2])) * 2.0 * r - r
        weights[0]['b'] = np.zeros(layer_units[1])
        weights[1]['b'] = np.zeros(layer_units[2])

        self.weights = weights
        return self.weights

    def fit(self, X, reg=3e-3, beta=3, sparsity_param=1e-1, learning_rate=1e-2,
            optimizer='L-BFGS-B', max_iters=100, verbose=False):
        """
        Train the autoencoder to reconstruct X with scipy.optimize.minimize.

        X:              (n_samples, layer_units[0]) input matrix, one sample per row.
        reg:            weight-decay strength.
        beta:           weight of the sparsity penalty term.
        sparsity_param: desired average activation of the hidden units.
        learning_rate:  unused; kept so existing callers keep working.
        optimizer:      ``method`` passed to scipy.optimize.minimize.
        max_iters:      maximum number of optimizer iterations.
        verbose:        print progress about 20 times over max_iters iterations
                        and forward the flag to the optimizer's ``disp`` option.

        Returns (weights, loss_history): the weights with the lowest loss seen
        by the optimizer callback, and the loss at every callback invocation.
        """
        if self.weights is None:
            # lazily initialize weights
            self.init_weights()

        layer_units = self.layer_units
        options = {'maxiter': max_iters, 'disp': verbose}
        # An integer interval of at least 1: the modulo test below stays
        # well-defined for any max_iters, including values below 20.
        report_every = max(1, max_iters // 20)

        def J(theta):
            weights = pack_struct(theta, layer_units)
            loss, grad = sparse_autoencoder_loss(weights, X, reg, beta=beta,
                                                 sparsity_param=sparsity_param)
            return loss, flatten_struct(grad)

        n_callbacks = 0
        best_loss = np.inf
        best_weights = None
        loss_history = []

        def progress(xk):
            nonlocal n_callbacks, best_loss, best_weights
            n_callbacks += 1

            # Loss history
            weights = pack_struct(xk, layer_units)
            loss, _ = sparse_autoencoder_loss(weights, X, reg, beta=beta,
                                              sparsity_param=sparsity_param)
            loss_history.append(loss)

            # Keep track of the best weights based on loss
            if loss < best_loss:
                best_loss = loss
                best_weights = _copy_weights(weights)

            if verbose and n_callbacks % report_every == 0:
                print("iter: {:4d}, loss: {:8f}".format(n_callbacks, loss))

        # Minimize the loss function
        init_theta = flatten_struct(self.weights)
        results = scipy.optimize.minimize(J, init_theta, method=optimizer, jac=True,
                                          callback=progress, options=options)

        # The callback may never record a candidate (e.g. the optimizer stops
        # before its first iteration); fall back to the optimizer's solution
        # instead of discarding the weights.
        if best_weights is None:
            best_weights = _copy_weights(pack_struct(results.x, layer_units))

        # Save weights
        self.weights = best_weights
        return self.weights, loss_history

    def predict(self, X):
        """
        Return the reconstruction of X, i.e. the activations of the output layer.
        """
        hidden = self.forward_pass(X)
        output_layer = self.weights[1]
        return ac_func(np.dot(hidden, output_layer['W']) + output_layer['b'])

    def forward_pass(self, X):
        """
        Perform forward pass and return activations of layer 1 (the hidden layer).
        """
        hidden_layer = self.weights[0]
        return ac_func(np.dot(X, hidden_layer['W']) + hidden_layer['b'])

    def flatten_struct(self, data):
        """Flatten structured weights or gradients with the module-level helper."""
        return flatten_struct(data)

    def pack_struct(self, data):
        """Rebuild structured weights from flat ``data`` using this model's layer sizes."""
        return pack_struct(data, self.layer_units)

    def get_weights(self):
        """Return the current structured weights."""
        return self.weights


def sparse_autoencoder_loss(weights, X, reg, beta=3, sparsity_param=0.1):
    """
    Compute loss and gradients of the sparse autoencoder.

    The loss is the mean squared reconstruction error plus weight decay
    (scaled by reg) plus the KL-divergence sparsity penalty (scaled by beta).

    weights:        structured weights [{'W', 'b'}, {'W', 'b'}].
    X:              (n_samples, n_input) input matrix; it is also the target.
    sparsity_param: desired average activation of the hidden units.

    Returns (loss, grad) where grad has the same structure as weights.
    """
    Y = X

    W0 = weights[0]['W']
    b0 = weights[0]['b']
    W1 = weights[1]['W']
    b1 = weights[1]['b']

    # Number of samples
    m = X.shape[0]

    # Forward pass
    a0 = X
    z1 = np.dot(a0, W0) + b0
    a1 = ac_func(z1)
    z2 = np.dot(a1, W1) + b1
    a2 = ac_func(z2)

    # Average activation of every hidden unit, used in the sparsity penalty
    rho = sparsity_param
    rho_hat = np.mean(a1, axis=0)
    # Derivative of the KL penalty with respect to rho_hat
    sparsity_delta = -rho / rho_hat + (1.0 - rho) / (1 - rho_hat)

    # Loss function
    mean_squared_error = 1.0 / (2.0 * m) * np.sum((a2 - Y) ** 2)
    weight_decay = 0.5 * reg * (np.sum(W0 * W0) + np.sum(W1 * W1))
    sparsity_term = beta * np.sum(KL_divergence(rho, rho_hat))
    loss = mean_squared_error + weight_decay + sparsity_term

    # Backpropagation; the sparsity term only enters the hidden-layer delta
    delta2 = (a2 - Y) * ac_func_deriv(z2)
    delta1 = (np.dot(delta2, W1.T) + beta * sparsity_delta) * ac_func_deriv(z1)

    # Gradients
    grad = [{} for _ in range(len(weights))]
    grad[0]['W'] = np.dot(a0.T, delta1) / m + reg * W0
    grad[0]['b'] = np.mean(delta1, axis=0)
    grad[1]['W'] = np.dot(a1.T, delta2) / m + reg * W1
    grad[1]['b'] = np.mean(delta2, axis=0)

    return loss, grad


def KL_divergence(p, q):
    """
    Kullback-Leibler divergence between Bernoulli distributions with means p and q.
    """
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))


def rel_err_gradients():
    """
    Return the relative error between analytic and numerical gradients
    on a small random problem.
    """
    # Number of layer units
    n_samples = 100
    input_size = 4 * 4
    hidden_size = 4
    output_size = input_size
    layer_units = (input_size, hidden_size, output_size)

    X_train = np.random.randn(n_samples, input_size)

    reg = 1e-4
    beta = 3               # weight of sparsity penalty term
    sparsity_param = 1e-1  # desired average activation of the hidden units

    # Define the classifier
    sae = SparseAutoencoder(layer_units)

    # Initialize weights
    weights = sae.init_weights()

    # Analytic gradients of the cost function
    cost, grad = sparse_autoencoder_loss(weights, X_train, reg, beta=beta,
                                         sparsity_param=sparsity_param)
    grad = sae.flatten_struct(grad)  # Flattened gradients

    def J(theta):
        # Structured weights
        weights = sae.pack_struct(theta)
        return sparse_autoencoder_loss(weights, X_train, reg, beta=beta,
                                       sparsity_param=sparsity_param)[0]

    theta = sae.flatten_struct(weights)
    numerical_grad = eval_numerical_gradient(J, theta)

    # Compare numerically computed gradients with those computed analytically
    rel_err = rel_norm_diff(numerical_grad, grad)

    return rel_err

Multilayer perceptron

import numpy as np
import scipy.optimize

from dnn_play.activations import tanh, tanh_deriv, sigmoid, sigmoid_deriv
from dnn_play.utils.np_utils import to_binary_class_matrix, flatten_struct, pack_struct
from dnn_play.utils.gradient_utils import eval_numerical_gradient, rel_norm_diff

# Added to the softmax probabilities so that log(proba) never sees an exact zero.
_ERR_TOL = 1e-10


def ac_func(x):
    """Activation function of the hidden layers (sigmoid)."""
    return sigmoid(x)


def ac_func_deriv(x):
    """Derivative of ``ac_func``; callers pass the pre-activations ``z``."""
    return sigmoid_deriv(x)


def _copy_weights(weights):
    """Return a copy of structured weights (a list of ``{'W': ..., 'b': ...}`` dicts)."""
    return [{name: value.copy() for name, value in layer.items()}
            for layer in weights]


def _forward_pass(weights, X):
    """
    Run the forward pass and return (z, a).

    z[i] is the pre-activation of layer i (z[0] is a placeholder so that the
    indices line up with the layer numbers) and a[i] is the activation fed
    into weights[i]. The output layer gets no activation: its pre-activation
    goes straight into the softmax.
    """
    a = [X]
    z = [None]
    last = len(weights) - 1
    for i, layer in enumerate(weights):
        z_next = np.dot(a[-1], layer['W']) + layer['b']
        z.append(z_next)
        if i < last:
            a.append(ac_func(z_next))
    return z, a


def _softmax(z_out):
    """Row-wise softmax of the output pre-activations, offset by _ERR_TOL."""
    # Subtract the row maximum to avoid overflow in exp()
    exp_shifted = np.exp(z_out - np.max(z_out, axis=1, keepdims=True))
    return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True) + _ERR_TOL


class MLP(object):
    """
    Multilayer perceptron with sigmoid hidden layers and a softmax output.

    layer_units: tuple with the size of every layer, input first.
    weights:     optional structured weights, a list with one
                 {'W': ..., 'b': ...} dict per layer; initialized lazily
                 by fit() when omitted.
    """

    def __init__(self, layer_units, weights=None):
        self.weights = weights
        self.layer_units = layer_units

    def init_weights(self, eps=1e-4):
        """
        Initialize the weights and store them on the instance.

        W is drawn from a normal distribution scaled by eps; the biases
        start at zero. Returns the structured weights.
        """
        layer_units = self.layer_units
        weights = [
            {
                'W': eps * np.random.randn(n_in, n_out),
                'b': np.zeros(n_out),
            }
            for n_in, n_out in zip(layer_units[:-1], layer_units[1:])
        ]
        self.weights = weights
        return self.weights

    def fit(self, X, y, X_val, y_val, reg=0.0, learning_rate=1e-2,
            optimizer='L-BFGS-B', max_iters=100, verbose=False):
        """
        Train the network with scipy.optimize.minimize.

        X, y:          training inputs (one sample per row) and integer labels.
        X_val, y_val:  validation inputs and labels, used to select the weights.
        reg:           weight-decay strength.
        learning_rate: unused; kept so existing callers keep working.
        optimizer:     ``method`` passed to scipy.optimize.minimize.
        max_iters:     maximum number of optimizer iterations.
        verbose:       print progress about 20 times over max_iters iterations
                       and forward the flag to the optimizer's ``disp`` option.

        Returns (weights, loss_history, train_acc_history, val_acc_history):
        the weights with the highest validation accuracy seen by the optimizer
        callback, and the histories recorded at every callback invocation.
        """
        if self.weights is None:
            # lazily initialize weights
            self.init_weights()

        layer_units = self.layer_units
        options = {'maxiter': max_iters, 'disp': verbose}
        # An integer interval of at least 1: the modulo test below stays
        # well-defined for any max_iters, including values below 20.
        report_every = max(1, max_iters // 20)

        def J(theta):
            weights = pack_struct(theta, layer_units)
            loss, grad = mlp_loss(weights, X, y, reg)
            return loss, flatten_struct(grad)

        n_callbacks = 0
        best_val_acc = -np.inf
        best_weights = None
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        def progress(xk):
            nonlocal n_callbacks, best_val_acc, best_weights
            n_callbacks += 1

            # Loss history
            weights = pack_struct(xk, layer_units)
            loss, _ = mlp_loss(weights, X, y, reg)
            loss_history.append(loss)

            # Training accuracy
            train_acc = np.mean(mlp_predict(weights, X) == y)
            train_acc_history.append(train_acc)

            # Validation accuracy
            val_acc = np.mean(mlp_predict(weights, X_val) == y_val)
            val_acc_history.append(val_acc)

            # Keep track of the best weights based on validation accuracy
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_weights = _copy_weights(weights)

            if verbose and n_callbacks % report_every == 0:
                print("iter: {:4d}, loss: {:8f}, train_acc: {:4f}, val_acc: {:4f}".format(
                    n_callbacks, loss, train_acc, val_acc))

        # Minimize the loss function
        init_theta = flatten_struct(self.weights)
        results = scipy.optimize.minimize(J, init_theta, method=optimizer, jac=True,
                                          callback=progress, options=options)

        # The callback may never record a candidate (e.g. the optimizer stops
        # before its first iteration); fall back to the optimizer's solution
        # instead of discarding the weights.
        if best_weights is None:
            best_weights = _copy_weights(pack_struct(results.x, layer_units))

        # Save weights
        self.weights = best_weights
        return self.weights, loss_history, train_acc_history, val_acc_history

    def predict(self, X):
        """
        X: the input matrix with one sample per row.

        Returns the predicted class index of every row.
        """
        return mlp_predict(self.weights, X)

    def flatten_struct(self, data):
        """Flatten structured weights or gradients with the module-level helper."""
        return flatten_struct(data)

    def pack_struct(self, data):
        """Rebuild structured weights from flat ``data`` using this model's layer sizes."""
        return pack_struct(data, self.layer_units)


def mlp_loss(weights, X, y, reg):
    """
    Compute loss and gradients of the neural network.

    The loss is the mean softmax cross-entropy plus weight decay scaled by reg.

    weights: structured weights, one {'W', 'b'} dict per layer.
    X:       input matrix with one sample per row.
    y:       labels, converted to a one-hot matrix with to_binary_class_matrix.

    Returns (loss, grad) where grad has the same structure as weights.
    """
    L = len(weights)  # The index of the output layer

    # Number of samples
    m = X.shape[0]

    z, a = _forward_pass(weights, X)
    proba = _softmax(z[L])

    # Target matrix of labels
    Y = to_binary_class_matrix(y)

    # Loss function
    sum_squared_weights = sum(np.sum(layer['W'] * layer['W']) for layer in weights)
    loss = -1.0 / m * np.sum(Y * np.log(proba)) + 0.5 * reg * sum_squared_weights

    # Backpropagation. delta[i] belongs to layer i; the input layer needs no
    # delta, so delta[0] stays unset and its large matrix product is skipped.
    delta = [None] * (L + 1)
    delta[L] = proba - Y
    for i in range(L - 1, 0, -1):
        delta[i] = np.dot(delta[i + 1], weights[i]['W'].T) * ac_func_deriv(z[i])

    # Gradients
    grad = [
        {
            'W': np.dot(a[i].T, delta[i + 1]) / m + reg * weights[i]['W'],
            'b': np.mean(delta[i + 1], axis=0),
        }
        for i in range(L)
    ]

    return loss, grad


def mlp_predict(weights, X):
    """
    X: the input matrix with one sample per row.

    Returns the predicted class index of every row.
    """
    z, _ = _forward_pass(weights, X)
    # Softmax is increasing in every score, so the arg-max of the output
    # pre-activations is the most probable class.
    return np.argmax(z[-1], axis=1)


def rel_err_gradients():
    """
    Return the relative error between analytic gradients and numerical ones
    on a small random problem.
    """
    # Number of layer units
    n_samples = 100
    input_size = 4 * 4
    hidden_size_L1 = 4
    hidden_size_L2 = 4
    output_size = 10
    layer_units = (input_size, hidden_size_L1, hidden_size_L2, output_size)

    X_train = np.random.randn(n_samples, input_size)
    y_train = np.random.randint(output_size, size=n_samples)
    reg = 1e-4

    # Define the classifier
    clf = MLP(layer_units)

    # Initialize weights
    weights = clf.init_weights()

    # Analytic gradients of the cost function
    cost, grad = mlp_loss(weights, X_train, y_train, reg)
    grad = clf.flatten_struct(grad)  # Flattened gradients

    def J(theta):
        # Structured weights
        weights = clf.pack_struct(theta)
        return mlp_loss(weights, X_train, y_train, reg)[0]

    theta = clf.flatten_struct(weights)
    numerical_grad = eval_numerical_gradient(J, theta)

    # Compare numerically computed gradients with those computed analytically
    rel_err = rel_norm_diff(numerical_grad, grad)

    return rel_err

If you are interested in all of the code used in this demonstration, please check the repository.

Comments

Implement stacked multilayer perceptron for digit recognition

Main workflow

Ipython notebook

Sparse autoencoder

Multilayer perceptron

Published

Category

Tags