Backpropagation Algorithm in Practice

The backpropagation implementation in this article is based on the backpropagation (BP) formulas derived in the previous article. If you are not familiar with the backpropagation algorithm, it is strongly recommended to read that article first.

We will implement a 4-layer fully connected network to solve a binary classification task. The network has 2 input nodes; the hidden layers have 25, 50, and 25 nodes respectively; and the two output nodes represent the probability of belonging to category 1 and category 2. Instead of using a Softmax function to constrain the sum of the output probabilities, we directly compute the mean squared error between the network output and the one-hot encoded ground-truth label. All layers use the Sigmoid activation function, so that we can directly apply our gradient propagation formulas.
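For reference, these are the backpropagation relations the code below implements, i.e. the standard results for a mean-squared-error loss with sigmoid activations (see the previous article for the derivation). Note the sign convention: the error is taken as y − o, so the weight update carries a plus sign.

$$
\sigma'(z) = \sigma(z)\,\big(1 - \sigma(z)\big), \qquad
\delta^{(L)} = \big(y - o^{(L)}\big) \odot \sigma'\big(z^{(L)}\big)
$$

$$
\delta^{(l)} = \Big(W^{(l+1)}\,\delta^{(l+1)}\Big) \odot \sigma'\big(z^{(l)}\big), \qquad
W^{(l)} \leftarrow W^{(l)} + \eta\, o^{(l-1)} \big(\delta^{(l)}\big)^{\top}
$$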

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

1. Prepare data

X, y = datasets.make_moons(n_samples=1000, noise=0.2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X.shape, y.shape)  # (1000, 2) (1000,)
def make_plot(X, y, plot_name):
    plt.figure(figsize=(12, 8))
    plt.title(plot_name, fontsize=30)
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
make_plot(X, y, "Classification Dataset Visualization ") 

2. The network layer

  • We create a new class Layer to implement a single network layer; it takes parameters such as the number of input nodes, the number of output nodes, and the activation function type.
  • The weight tensor weights and the bias tensor bias are automatically generated and initialized during construction according to the number of input and output nodes.
class Layer:
    # Full link network layer
    def __init__(self, n_input, n_output, activation=None, weights=None, bias=None):
        "" :param int n_input: number of input nodes :param int n_output: number of output nodes :param STR activation: Activation Function type :param weights: Weight tensor, default class internal generated :param bias: default class internal generated """
        self.weights = weights if weights is not None else np.random.randn(n_input, n_output) * np.sqrt(1 / n_output) 
        self.bias = bias if bias is not None else np.random.rand(n_output) * 0.1
        self.activation = activation # activate function type, e.g. 'sigmoid'
        self.activation_output = None  # output of the activation function, i.e. o
        self.error = None  # intermediate variable used to compute the delta of the current layer
        self.delta = None  # delta of the current layer, recorded to compute the gradient
    
    def activate(self, X):
        # forward calculation function
        r = np.dot(X, self.weights) + self.bias # X@W + b
        # pass r through the activation function to get the layer output o
        self.activation_output = self._apply_activation(r)
        return self.activation_output
    
    def _apply_activation(self, r):
        # compute the output of the activation function
        if self.activation is None:
            return r # no activation function, return directly
        elif self.activation == 'relu':
            return np.maximum(r, 0)
        elif self.activation == 'tanh':
            return np.tanh(r)
        elif self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-r))
        
        return r
    
    def apply_activation_derivative(self, r):
        # compute the derivative of the activation function
        # note: r here is the activation output o, which is why e.g. the sigmoid derivative is r * (1 - r)
        # No activation function, derivative is 1
        if self.activation is None:
            return np.ones_like(r)
        # Derivative of ReLU function
        elif self.activation == 'relu':             
            grad = np.array(r, copy=True)             
            grad[r > 0] = 1.             
            grad[r <= 0] = 0.             
            return grad
        # derivative of the tanh function
        elif self.activation == 'tanh':             
            return 1 - r ** 2 
        # Sigmoid function derivative implementation
        elif self.activation == 'sigmoid': 
            return r * (1 - r)
        return r
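As a quick sanity check (an illustrative snippet, not part of the original code), a single Layer can be exercised on its own; with 2 inputs and 3 outputs the forward pass should return 3 values in (0, 1):

layer = Layer(2, 3, 'sigmoid')  # a standalone layer: 2 inputs => 3 outputs
x = np.array([0.5, -1.0])       # a single input sample
out = layer.activate(x)         # forward pass: sigmoid(x @ W + b)
print(out.shape, out)           # (3,) and three values between 0 and 1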

3. Network model

  • With the single-layer network class in place, we implement the network model class NeuralNetwork.
  • It internally maintains a list of Layer objects for the individual layers, and new layers can be appended with the add_layer function,
  • which allows network models of different structures to be created.
y_test.flatten().shape  # (300,)
class NeuralNetwork:
    def __init__(self):
        self._layers = [] # List of network layer objects
    
    def add_layer(self, layer):
        self._layers.append(layer)
    
    def feed_forward(self, X):
        # forward propagation: compute the network output layer by layer
        for layer in self._layers:
            X = layer.activate(X)
        return X
    
    def backpropagation(self, X, y, learning_rate):
        # Backpropagation algorithm implementation
        # Compute forward to get the final output value
        output = self.feed_forward(X)
        for i in reversed(range(len(self._layers))): # Reverse loop
            layer = self._layers[i]
            if layer == self._layers[-1]:  # output layer
                layer.error = y - output
                # compute the delta of the output layer, see the output-layer gradient formula
                layer.delta = layer.error * layer.apply_activation_derivative(output)
            else: # If it is a hidden layer
                next_layer = self._layers[i + 1]
                layer.error = np.dot(next_layer.weights, next_layer.delta)
                layer.delta = layer.error*layer.apply_activation_derivative(layer.activation_output)
        
        # loop to update weights
        for i in range(len(self._layers)):
            layer = self._layers[i]
            # o_i is the output of the previous network layer
            o_i = np.atleast_2d(X if i == 0 else self._layers[i - 1].activation_output)
            # gradient descent update; error was defined as (y - output), so delta already carries
            # the minus sign of the gradient, hence the plus sign here (the bias is left unchanged)
            layer.weights += layer.delta * o_i.T * learning_rate
    
    def train(self, X_train, X_test, y_train, y_test, learning_rate, max_epochs):
        # network training function
        # one-hot encoding
        y_onehot = np.zeros((y_train.shape[0], 2))
        y_onehot[np.arange(y_train.shape[0]), y_train] = 1
        mses = []
        for i in range(max_epochs):  # train for max_epochs epochs
            for j in range(len(X_train)):  # train on one sample at a time
                self.backpropagation(X_train[j], y_onehot[j], learning_rate)
            if i % 10 == 0:
                # print the MSE loss every 10 epochs
                mse = np.mean(np.square(y_onehot - self.feed_forward(X_train)))
                mses.append(mse)
                print('Epoch: #%s, MSE: %f, Accuracy: %.2f%%' %
                      (i, float(mse), self.accuracy(self.predict(X_test), y_test.flatten()) * 100))

        return mses
    
    def accuracy(self, y_predict, y_test):  # compute classification accuracy
        return np.sum(y_predict == y_test) / len(y_test)
    
    def predict(self, X_predict):
        y_predict = self.feed_forward(X_predict)  # y_predict has shape [N, 2]; the second dimension holds the two class probabilities
        y_predict = np.argmax(y_predict, axis=1)
        return y_predict        
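Optionally, the deltas computed by backpropagation can be verified against a numerical gradient. The sketch below is illustrative only and not part of the original code: it calls backpropagation with learning_rate=0 so that the deltas are computed while the weights stay unchanged, then compares the analytic gradient of the per-sample loss 0.5 * sum((y - o)^2) with a central-difference estimate for one layer's weight matrix.

def gradient_check(nn, x, y, layer_idx=0, eps=1e-5):
    # compute the deltas without modifying the weights (learning_rate=0)
    nn.backpropagation(x, y, learning_rate=0.0)
    layer = nn._layers[layer_idx]
    o_prev = x if layer_idx == 0 else nn._layers[layer_idx - 1].activation_output
    # analytic gradient of L = 0.5 * sum((y - o)^2); delta carries the minus sign of the gradient
    analytic = -np.outer(o_prev, layer.delta)
    numeric = np.zeros_like(layer.weights)
    for i in range(layer.weights.shape[0]):
        for j in range(layer.weights.shape[1]):
            layer.weights[i, j] += eps
            loss_plus = 0.5 * np.sum((y - nn.feed_forward(x)) ** 2)
            layer.weights[i, j] -= 2 * eps
            loss_minus = 0.5 * np.sum((y - nn.feed_forward(x)) ** 2)
            layer.weights[i, j] += eps  # restore the original weight
            numeric[i, j] = (loss_plus - loss_minus) / (2 * eps)
    return np.max(np.abs(analytic - numeric))  # should be close to zero

After the model is assembled in the next section, a call such as gradient_check(nn, X_train[0], np.array([1.0, 0.0])) should return a value close to zero.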

4. Network training

nn = NeuralNetwork()  # instantiate the network class
nn.add_layer(Layer(2, 25, 'sigmoid'))   # hidden layer 1, 2 => 25
nn.add_layer(Layer(25, 50, 'sigmoid'))  # hidden layer 2, 25 => 50
nn.add_layer(Layer(50, 25, 'sigmoid'))  # hidden layer 3, 50 => 25
nn.add_layer(Layer(25, 2, 'sigmoid'))   # output layer, 25 => 2
mses = nn.train(X_train, X_test, y_train, y_test, learning_rate=0.01, max_epochs=50)
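The train function returns the list of recorded MSE values, so the training curve can be inspected directly; a minimal plot (not part of the original code):

plt.figure(figsize=(12, 8))
plt.plot(mses, marker='o')  # one point per logging step (every 10 epochs)
plt.xlabel('Logging step')
plt.ylabel('MSE')
plt.title('Training MSE')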
def plot_decision_boundary(model, axis):
    
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(1, -1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    
    y_predic = model.predict(X_new)
    zz = y_predic.reshape(x0.shape)
    
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF590', '#90CAF9'])
    
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
plt.figure(figsize=(12, 8))
plot_decision_boundary(nn, [-2, 2.5, -1, 2])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])

y_predict = nn.predict(X_test)
y_predict[:10] # array([1, 1, 0, 1, 0, 0, 0, 1, 1, 1], dtype=int64)
y_test[:10] # array([1, 1, 0, 1, 0, 0, 0, 1, 1, 1], dtype=int64)
nn.accuracy(y_predict, y_test.flatten()) # 0.86