This is the 13th day of my participation in the August More Text Challenge. For details, see: August More Text Challenge

Building my first neural network with NumPy

Preface

This post uses pure NumPy to implement handwritten digit recognition. It first lays out the overall structure of the network and then walks through the code. This is my first neural network.

Complete code: GitHub

The network consists of an input layer, a hidden layer, and an output layer. What we already know is that the output layer has ten outputs: the probabilities of the ten digits.
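In formula form, the forward pass built below is y = softmax(tanh(x·W1 + b1)·W2 + b2), where x is the flattened 28×28 image.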

About training sets, validation sets, test sets

Gradient descent by hand: solving for the three parameters

The first parameter is calculated by hand
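
As a sketch, the chain-rule expressions that the backpropagation code below implements are (y is the one-hot label, y_pred the softmax output, and the subscripts follow the layer order in the code):

∂L/∂b2 = d_softmax(l_in2) · (−2 (y − y_pred))
∂L/∂W2 = outer(l_out1, ∂L/∂b2)
∂L/∂b1 = d_tanh(l_in1) ⊙ (W2 · ∂L/∂b2)
∂L/∂W1 = outer(x, ∂L/∂b1)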

Code section

The activation functions and their derivatives

# Imports used throughout this post
import copy
import math
import struct
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pylab

def tanh(x):
    return np.tanh(x)

def bypass(x):
    return x

def softmax(x):
    exp = np.exp(x - x.max())   # Subtract the maximum to prevent overflow in the exponential;
                                # shifting the input this way does not change the softmax result.
    return exp / exp.sum()

def d_softmax(data):
    sm = softmax(data)
    return np.diag(sm) - np.outer(sm, sm)

def d_tanh(data):  # returns a vector (element-wise derivative), unlike d_softmax which returns a matrix
    return 1 / (np.cosh(data)) ** 2

def d_bypass(x) :
    return 1

differential = {softmax: d_softmax, tanh: d_tanh, bypass: d_bypass}
d_type = {bypass: 'times', softmax: 'dot', tanh: 'times'}  # how each derivative is combined with the incoming gradient
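A quick shape check (a minimal sketch using the functions above) shows why the two derivative types exist: d_tanh is element-wise and returns a vector, while d_softmax returns a full Jacobian matrix, which is why d_type pairs them with 'times' and 'dot' respectively.

x = np.array([0.1, 0.2, 0.3])
print(d_tanh(x).shape)     # (3,)   -> combined element-wise ('times')
print(d_softmax(x).shape)  # (3, 3) -> combined with a matrix product ('dot')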

Initialization parameters

dimensions = [28*28, 100, 10]   # 28*28 input neurons, 100 hidden neurons, 10 output neurons
# The 28*28 input neurons connect to the 100 hidden neurons, which connect to the 10 output neurons
activation = [bypass, tanh, softmax]  # bypass for the input layer, then two activation functions
distribution = [   # initialization ranges for each layer
    {},  # the input layer has no parameters
    {'b': [0, 0], 'w': [-math.sqrt(6/(dimensions[0]+dimensions[1])), math.sqrt(6/(dimensions[0]+dimensions[1]))]},
    {'b': [0, 0], 'w': [-math.sqrt(6/(dimensions[1]+dimensions[2])), math.sqrt(6/(dimensions[1]+dimensions[2]))]},
]
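
The ±sqrt(6/(n_in + n_out)) range is the Xavier (Glorot) uniform initialization bound; for the first weight matrix here it is roughly ±sqrt(6/(784+100)) ≈ ±0.082, and the biases start at 0.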

Initialize the parameter function

def init_parameters_b(layer) :  # Initialize b
    dist= distribution[layer]['b']
    return np.random.rand(dimensions[layer])*(dist[1]-dist[0])+dist[0]    

def init_parameters_w(layer) :
    dist=distribution[layer]['w']
    return np.random.rand(dimensions[layer-1],dimensions[layer])*(dist[1]-dist[0])+dist[0]

def init_parameters():
    parameter = []   # collects the parameters of every layer
    for i in range(len(distribution)):
        layer_parameter = {}   # parameters of the current layer
        for j in distribution[i].keys():
            if j=='b':
                layer_parameter['b']=init_parameters_b(i)
                continue
            if j=='w':
                layer_parameter['w']=init_parameters_w(i)
                continue
        parameter.append(layer_parameter)
    return parameter

Initial parameters (untrained)

parameters=init_parameters()

Prediction function

def predict(img,parameters) :
    # Parameters: images, parameters
    l_in=img
    l_out = activation[0](l_in)  # first layer: just pass the image through
    for layer in range(1, len(dimensions)):
        l_in = np.dot(l_out, parameters[layer]['w']) + parameters[layer]['b']  # iterate layer by layer
        l_out = activation[layer](l_in)
    return l_out

The first prediction (this result is essentially random because it uses the freshly initialized model)

predict(train_img[0], init_parameters())
# Result: array([0.07210171, 0.07957606, 0.13152407, 0.05420442, 0.08498909,
#                0.12788144, 0.14911174, 0.14570486, 0.08225591, 0.07265069])
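Because the last layer is a softmax, these ten values are non-negative and sum to 1, so they can be read as the probabilities of the ten digits.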

Training set, validation set, test set

dataset_path=Path('D:/Desktop/MNIST')

train_img_path=dataset_path/'train-images-idx3-ubyte/train-images.idx3-ubyte'
train_lab_path=dataset_path/'train-labels-idx1-ubyte/train-labels.idx1-ubyte'
test_img_path=dataset_path/'t10k-images-idx3-ubyte/t10k-images.idx3-ubyte'
test_lab_path=dataset_path/'t10k-labels-idx1-ubyte/t10k-labels.idx1-ubyte'

Splitting the data into the three sets

train_num = 50000  # training
valid_num = 10000  # validation
test_num = 10000  # test

with open(train_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))
    temp_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28 * 28) / 255
    train_img = temp_img[:train_num]  # the first 50,000 images are for training; the remaining 10,000 are for validation
    valid_img = temp_img[train_num:]

with open(test_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))
    test_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28 * 28) / 255

with open(train_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))
    temp_lab = np.fromfile(f, dtype=np.uint8)
    train_lab = temp_lab[:train_num]
    valid_lab = temp_lab[train_num:]

with open(test_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))
    test_lab = np.fromfile(f, dtype=np.uint8)
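The 16 bytes skipped from each image file are the IDX header (magic number, image count, row count, column count, each a big-endian 32-bit integer), and each label file starts with an 8-byte header (magic number and label count). As a small sketch, assuming the paths above, the header can be inspected instead of discarded:

with open(train_img_path, 'rb') as f:
    magic, num, rows, cols = struct.unpack('>4i', f.read(16))
    print(magic, num, rows, cols)  # 2051 60000 28 28 for the MNIST training images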

Showing an image and its label

def show_train(index) :
    plt.imshow(train_img[index].reshape(28, 28), cmap='gray')
    pylab.show()
    print('label:{}'.format(train_lab[index]))


def show_valid(index) :
    plt.imshow(valid_img[index].reshape(28, 28), cmap='gray')
    pylab.show()
    print('label:{}'.format(valid_lab[index]))


def show_test(index) :
    plt.imshow(test_img[index].reshape(28, 28), cmap='gray')
    pylab.show()
    print('test:{}'.format(test_lab[index]))
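For example, show_train(0) displays the first training image and prints its label.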

Use random numbers to predict results

predict(np.random.rand(784), parameters)
# Result: array([0.0942381 , 0.11644771, 0.05850607, 0.23711087, 0.02732923,
#                0.0176975 , 0.19317991, 0.14196864, 0.08510021, 0.02842176])

Check that the derivatives are correct by comparing them against the numerical derivative from the definition; if the derivative is wrong, everything that follows falls apart.
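The numerical check uses the definition of the derivative: f'(x) ≈ (f(x + h) − f(x)) / h for a small h.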

h = 0.0001
func = softmax
input_len = 4
for i in range(input_len):  # Two ways to take the derivative
    # One is the definition method, and one is the direct derivation method
    test_input = np.random.rand(input_len)
    derivative = differential[func](test_input)
    value1 = func(test_input)
    test_input[i] += h
    value2 = func(test_input)
    # print((value2 - value1) / h)
    print(derivative[i] - (value2 - value1) / h)   # should be close to 0

onehot = np.identity(dimensions[-1])  # one-hot vectors for the 10 digits


Gradient descent is implemented here through back propagation (in simple terms, repeated application of the chain rule). The goal is to make the loss function as small as possible, i.e. to bring the prediction y1 as close as possible to the true value y0, so the parameters are updated from their initial values again and again.
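Each update follows the usual gradient-descent rule, parameter = parameter − learn_rate × gradient, which is what combine_parameters implements further below.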

def sqr_loss(img, lab, parameters) :
    y_pred = predict(img, parameters)
    y = onehot[lab]
    diff = y - y_pred
    return np.dot(diff, diff)

def grad_parameters(img, lab, parameters) :
    # Parameters: images, parameters
    l_in_list = [img]  # the input layer has no parameters, so the first "in" is simply the image
    l_out_list = [activation[0](l_in_list[0])]  # the first "out" is the bypassed image
    for layer in range(1, len(dimensions)):
        l_in = np.dot(l_out_list[layer-1], parameters[layer]['w']) + parameters[layer]['b']
        l_out = activation[layer](l_in)
        l_in_list.append(l_in)
        l_out_list.append(l_out)
        
    d_layer =-2*(onehot[lab] - l_out_list[-1])
    grad_result=[None] *len(dimensions)
    for layer in range(len(dimensions)-1, 0, -1):  # back propagation, from the last layer down to the first
        if d_type[activation[layer]]=='times':
            d_layer = differential[activation[layer]](l_in_list[layer])*d_layer
        if d_type[activation[layer]]=='dot':
            d_layer = np.dot(differential[activation[layer]](l_in_list[layer]), d_layer)
        # gradients of this layer's b and w
        grad_result[layer]={}
        grad_result[layer]['b'] = d_layer
        grad_result[layer]['w'] = np.outer(l_out_list[layer-1], d_layer)  # Results from the previous layer
        d_layer = np.dot(parameters[layer]['w'],d_layer)
    return grad_result 

Gradient results from back propagation

grad_parameters(train_img[0],train_lab[0],init_parameters())

Verify that the partial derivative of back propagation is correct

# Verify parameter b
h = 0.00001  # verify that the backpropagation derivation above is correct
layer=2
parameters = init_parameters()
pname='b'
for i in range(len(parameters[layer][pname])):  	# Two ways to take the derivative
    # One is the definition method, and one is the direct derivation method
    img_i = np.random.randint(train_num)  			# Random digital images
    test_parameters = init_parameters()  			# random find
    derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)[layer][pname]
    value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    test_parameters[layer][pname][i] += h
    value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    print(derivative[i]-(value2-value1)/h)

# Verify parameter w
h = 0.00001  # verify backpropagation for the w parameters as well
layer=1
parameters = init_parameters()
pname='w'
grad_list=[]
for i in range(len(parameters[layer][pname])):  # Two ways to take the derivative
    for j in range(len(parameters[layer][pname][0])): 
        img_i = np.random.randint(train_num)  # Random digital images
        test_parameters = init_parameters()  # random find
        derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)[layer][pname]
        value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
        test_parameters[layer][pname][i][j] += h
        value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
        grad_list.append(derivative[i][j]-(value2-value1)/h)
np.abs(grad_list).max()
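
If the maximum absolute difference printed here is tiny (on the order of h or smaller), the analytic gradients agree with the numerical ones.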

Loss and accuracy functions

def valid_loss(parameters):  # loss on the validation set
    loss_accu = 0
    for img_i in range(valid_num):
        loss_accu += sqr_loss(valid_img[img_i], valid_lab[img_i], parameters)
    return loss_accu / (valid_num / 10000)  # normalize: report the loss per 10,000 images

    
def valid_accuracy(parameters) :  # accuracy
    correct = [predict(valid_img[img_i], parameters).argmax() == valid_lab[img_i] for img_i in range(valid_num)]
    return correct.count(True) / len(correct)


def train_loss(parameters):   # loss on the training set
    loss_accu = 0
    for img_i in range(train_num):
        loss_accu += sqr_loss(train_img[img_i], train_lab[img_i], parameters)
    return loss_accu / (train_num / 10000)


def train_accuracy(parameters) :  # accuracy
    correct = [predict(train_img[img_i], parameters).argmax() == train_lab[img_i] for img_i in range(train_num)]
    return correct.count(True) / len(correct)

def test_accuracy(parameters) :  # accuracy
    correct = [predict(test_img[img_i], parameters).argmax() == test_lab[img_i] for img_i in range(test_num)]
    return correct.count(True) / len(correct)

Gradient accumulation and parameter update

def grad_add(grad1,grad2) :
    for layer in range(1, len(grad1)):
        for pname in grad1[layer].keys():
            grad1[layer][pname]+=grad2[layer][pname]
    return grad1
    
def grad_divide(grad,denominator) :
    for layer in range(1, len(grad)):
        for pname in grad[layer].keys():
            grad[layer][pname]/=denominator
    return grad
    
def combine_parameters(parameters, grad, learn_rate) :  # New parameter formation
    parameter_tmp = copy.deepcopy(parameters)
    for layer in range(len(parameter_tmp)):
        for pname in parameter_tmp[layer].keys():
            parameter_tmp[layer][pname] -= learn_rate * grad[layer][pname]
    return parameter_tmp

Training one batch at a time

batch_size = 100  # treat 100 images as one batch; train on one batch at a time

def train_batch(current_batch, parameters):  # compute the gradients of the 100 images in a batch and average them
    grad_accu = grad_parameters(train_img[current_batch * batch_size], train_lab[current_batch * batch_size],
                                parameters)
    for img_i in range(1, batch_size):
        grad_temp = grad_parameters(train_img[current_batch * batch_size + img_i],
                                    train_lab[current_batch * batch_size + img_i], parameters)
        grad_add(grad_accu, grad_temp)  # accumulate the gradients of every image in the batch
    grad_divide(grad_accu, batch_size)  # average them to get the update direction
    return grad_accu

parameters = init_parameters()
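With train_num = 50000 and batch_size = 100, each epoch below therefore runs through 500 batches.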

The training process

from tqdm import tqdm_notebook
current_epoch = 0
train_loss_list = []  # training loss per epoch
valid_loss_list = []  # validation loss per epoch
train_accu_list = []  # training accuracy per epoch
valid_accu_list = []  # validation accuracy per epoch
learn_rate = 10 ** -0.3  # learning rate (about 0.5); it should be reduced towards the end of training
epoch_num = 5  # number of epochs; one full pass over the training set is one epoch
for epoch in tqdm_notebook(range(epoch_num)):
    for i in range(train_num // batch_size):
# if i % 100 == 99:
# print('running batch{}/{}'.format(i + 1, train_num // batch_size))
        grad_tmp = train_batch(i, parameters)
        parameters = combine_parameters(parameters, grad_tmp, learn_rate)
    current_epoch += 1
    train_loss_list.append(train_loss(parameters))
    train_accu_list.append(train_accuracy(parameters))
    valid_loss_list.append(valid_loss(parameters))
    valid_accu_list.append(valid_accuracy(parameters))

    
valid_accuracy(parameters)

Loss of validation set and training set

lower = -0
plt.plot(valid_loss_list[lower:], color='black', label='validation loss')
plt.plot(train_loss_list[lower:], color='red', label='train loss')
plt.legend()
plt.show()

Accuracy of the validation set and the training set

plt.plot(valid_accu_list[lower:], color='black', label='validation accuracy')
plt.plot(train_accu_list[lower:], color='red', label='train accuracy')
plt.legend()
plt.show()

Finally

This is Xiao Sheng Fan Yi; I look forward to your follow.