· Use Word2Vec and RNN (LSTM) for text sentiment analysis (how to read the human mind)

This post builds on the following:

  1. DNN
  2. CNN
  3. Word2Vec
  4. RNN(LSTM)

If any of these are unfamiliar, review our previous posts. Below, the same sentiment classifier is implemented three ways: with a fully connected network, a convolutional neural network, and a recurrent neural network. Code part: 1. Fully connected implementation

import os
os.environ["KMP_DUPLICATE_LIB_OK"] ="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras



reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)

reviews_datas = reviews.values
labels_datas = labels.values
chars = ["", " "]  # tokens to skip when building the vocabulary

def get_words(npll):
    words = []
    for ii in npll:
        for i in ii[0].split(" "):
            if i in chars:
                pass
            else:
                words.append(i)
    return words
words = get_words(reviews_datas)

vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>", 0]]
count.extend(vocab)
#print(count[:10])

word2id = {}
id2word = {}
for i, w in enumerate(count):
    word2id[w[0]] = i
    id2word[i] = w[0]
print(id2word[100], word2id['i'])

reviews_seq = [seq[0].split(" ") for seq in reviews_datas]

reviews_list = []
seq_len = 256
for seq in reviews_seq:
    l = [1]
    for s in seq:
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if(len(l)>=seq_len):
        l=l[:seq_len]
    while(len(l)<seq_len):
        l.append(0)
    reviews_list.append(l)

reviews_list = np.array(reviews_list)
labels_list = pd.get_dummies(labels).values


x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]

y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]

labels = np.argmax(labels_list,axis=1)
print(reviews_list[0],labels[0])

print(reviews_list[0], len(reviews_list))
train_data = reviews_list 
train_labels = labels

train_rate=0.0001 
train_step=20
batch_size=500
embed_size = 32
sequence_length = 256
n_classes = 2


h1_num = 32
h2_num = 16
h3_num = 2





x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")

y = tf.placeholder(dtype=tf.float32,shape=[None,2],name="expected_y")
print(y)

embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
x_1 = tf.nn.embedding_lookup(embeddings,x)
#(-1,256)-->(-1,256,32)
h1 = tf.keras.layers.GlobalAveragePooling1D()(x_1)
# (-1, 256, 32) -> (-1, 32)
weights2 = tf.Variable(tf.random_normal(shape=[h1_num,h2_num]))
bias2 = tf.Variable(tf.fill([h2_num],0.1))
# (-1, 32) -> (-1, 16)
h2 =  tf.nn.relu(tf.matmul(h1,weights2)+bias2)


# (-1, 16) -> (-1, 2)
#y_ = tf.nn.softmax(tf.matmul(h3,weights4)+bias4)

weights3 = tf.Variable(tf.random_normal(shape=[h2_num,h3_num]))
bias3 = tf.Variable(tf.fill([h3_num],0.1))
predy = (tf.matmul(h2,weights3)+bias3)

cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=predy))

opt=tf.train.AdamOptimizer().minimize(cost)

correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))

with tf.Session() as sess:
    saver = tf.train.Saver()
    srun = sess.run
    init =  tf.global_variables_initializer()
    srun(init)
    for e in range(train_step):
        for t in range(20000//batch_size):
            ts = int(t*batch_size)
            batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
            srun(opt,{x:batch_x,y:batch_y})
            if(t%1==0):  # log every batch
                accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
                print(e,t,cost_val,accuracy_val)
                saver.save(sess,'./2RNN/3_1Word2Vec/txt/saver/model.ckpt',global_step=t)
        accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
        print(e,cost_val,accuracy_val)

The output

...
19 37 0.29615197 0.906
19 38 0.31939483 0.87
19 39 0.4328907 0.81
19 0.42973673 0.8094
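A note on the Word2Vec part of the title: in the script above the embedding matrix is initialized with tf.random_normal and learned from scratch during training. If you want to start from pretrained Word2Vec vectors instead, a minimal sketch is shown below. It assumes gensim is installed and reuses the reviews_seq, word2id, vocab_size and embed_size variables defined in the script; the window and min_count values are illustrative, not tuned.

import numpy as np
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized reviews (reviews_seq is a list of token lists).
# In gensim < 4.0 the `vector_size` argument is called `size`.
w2v = Word2Vec(sentences=reviews_seq, vector_size=embed_size, window=5, min_count=1)

# Build an embedding matrix aligned with word2id; words missing from the
# Word2Vec vocabulary keep a small random initialization.
embedding_matrix = np.random.normal(scale=0.1, size=(vocab_size, embed_size)).astype(np.float32)
for word, idx in word2id.items():
    if word in w2v.wv:
        embedding_matrix[idx] = w2v.wv[word]

# Then replace the random initialization in the graph:
# embeddings = tf.Variable(embedding_matrix, name="embeddings")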

2. CNN implementation

import os
os.environ["KMP_DUPLICATE_LIB_OK"] ="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras




reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)

reviews_datas = reviews.values
labels_datas = labels.values
chars = ["", " "]  # tokens to skip when building the vocabulary

def get_words(npll):
    words = []
    for ii in npll:
        for i in ii[0].split(" "):
            if i in chars:
                pass
            else:
                words.append(i)
    return words
words = get_words(reviews_datas)

vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>", 0]]
count.extend(vocab)
#print(count[:10])

word2id = {}
id2word = {}
for i, w in enumerate(count):
    word2id[w[0]] = i
    id2word[i] = w[0]
print(id2word[100], word2id['i'])

reviews_seq = [seq[0].split(" ") for seq in reviews_datas]

reviews_list = []
seq_len = 256
for seq in reviews_seq:
    l = [1]
    for s in seq:
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if(len(l)>=seq_len):
        l=l[:seq_len]
    while(len(l)<seq_len):
        l.append(0)
    reviews_list.append(l)

reviews_list = np.array(reviews_list)
labels_list = pd.get_dummies(labels).values


x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]

y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]



train_rate=0.0001 
train_step=50
batch_size=500
embed_size = 16
sequence_length = 256
n_classes = 2




h1_num = 32
h2_num = 16
h3_num = 2





# (-1, 256)
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")

embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
# (-1, 256) -> (-1, 256, 16)
x_1 = tf.nn.embedding_lookup(embeddings,x)



y=tf.placeholder(dtype=tf.float32,shape=[None,h3_num],name="expected_y")


def CNN(x):
    # Convert the input to the shape the CNN expects: (batch_size, sequence_length, frame_size, depth)
    # (-1, 256, 16) -> (-1, 256, 16, 1)
    x = tf.reshape(x,[-1,sequence_length,embed_size,1])
    # (-1, 256, 16, 1) -> (-1, 128, 8, 1)
    pool0 = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # The first layer: convolution layer
    # (-1, 128, 8, 1) -> (-1, 128, 8, 32)
    conv1_weights = tf.get_variable("conv1_weights", [5, 5, 1, 32], initializer=tf.truncated_normal_initializer(stddev=0.1))  # input depth 1, filter depth 32
    conv1_biases = tf.get_variable("conv1_biases", [32], initializer=tf.constant_initializer(0.0))
    conv1 = tf.nn.conv2d(pool0, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')  # stride 1, zero padding
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))  # ReLU activation for non-linearity

    # layer 2: maximum pooling layer
    # Pooling filter 2*2, stride 2, zero padding
    # (-1, 128, 8, 32) -> (-1, 64, 4, 32)
    pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Third layer: convolution layer
    # (-1, 64, 4, 32) -> (-1, 64, 4, 64)
    conv2_weights = tf.get_variable("conv2_weights", [3, 3, 32, 64], initializer=tf.truncated_normal_initializer(stddev=0.1))  # input depth 32, filter depth 64
    conv2_biases = tf.get_variable("conv2_biases", [64], initializer=tf.constant_initializer(0.0))
    conv2 = tf.nn.conv2d(pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')  # stride 1, zero padding
    relu2 = tf.nn.relu( tf.nn.bias_add(conv2, conv2_biases) )  

    # Layer 4: Maximum pooling layer
    # Pooling filter 2*2, stride 2, zero padding
    # (-1, 64, 4, 64) -> (-1, 32, 2, 64)
    pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    
    # Layer 5: fully connected layer
    fc1_weights = tf.get_variable("fc1_weights", [32 * 2 * 64, 256], initializer=tf.truncated_normal_initializer(stddev=0.1))  # 32*2*64 = 4096: flatten the previous output into a feature vector
    fc1_baises = tf.get_variable("fc1_baises", [256], initializer=tf.constant_initializer(0.1))
    # (-1, 32, 2, 64) -> (-1, 32 * 2 * 64)
    pool2_vector = tf.reshape(pool2, [-1, 32 * 2 * 64])
    # (-1, 32 * 2 * 64) -> (-1, 256)
    fc1 = tf.nn.relu(tf.matmul(pool2_vector, fc1_weights) + fc1_baises)
    fc2_weights = tf.get_variable("fc2_weights", [256, 2], initializer=tf.truncated_normal_initializer(stddev=0.1))  # project the 256-d feature vector onto the 2 classes
    fc2_baises = tf.get_variable("fc2_baises", [2], initializer=tf.constant_initializer(0.1))
    # (-1, 256) -> (-1, 2)
    h2 = tf.matmul(fc1, fc2_weights) + fc2_baises
    return (h2)
# To reduce overfitting, a Dropout layer could be added (see the sketch after the training output below)

predy = CNN(x_1)


cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,logits=predy))

opt=tf.train.AdamOptimizer().minimize(cost)

correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))

with tf.Session() as sess:
    saver = tf.train.Saver()
    srun = sess.run
    init =  tf.global_variables_initializer()
    srun(init)
    for e in range(train_step):
        for t in range(20000//batch_size):
            ts = int(t*batch_size)
            batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
            srun(opt,{x:batch_x,y:batch_y})
            if(t%1==0):  # log every batch
                accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
                print(e,t,cost_val,accuracy_val)
        saver.save(sess,'/Users/yss/YSSFiles/TFAPP/2RNN/txt_deal/saver/model',global_step=e)        
        accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
        print(e,cost_val,accuracy_val)

The output

...
39 37 0.11854752 0.974
39 38 0.05035739 0.994
39 39 0.025472356 1.0
39 1.1234461 0.657
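In the last epoch the training-batch accuracy reaches 1.0 while validation accuracy falls to 0.657, i.e. the network overfits. One remedy, as the comment in the code hints, is dropout between the two fully connected layers. Below is a minimal sketch in the same TF 1.x style, meant to go inside the CNN function right after fc1 is computed; keep_prob is a new placeholder (an assumption, not part of the original script) that would be fed 0.5 during training and 1.0 during evaluation.

keep_prob = tf.placeholder(tf.float32, name="keep_prob")
# Randomly zero out part of fc1's activations during training.
fc1_drop = tf.nn.dropout(fc1, keep_prob=keep_prob)
h2 = tf.matmul(fc1_drop, fc2_weights) + fc2_baises
# Training:   srun(opt, {x: batch_x, y: batch_y, keep_prob: 0.5})
# Evaluation: srun([accuracy, cost], {x: x_val, y: y_val, keep_prob: 1.0})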

3. RNN (LSTM) implementation

import os
os.environ["KMP_DUPLICATE_LIB_OK"] ="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras




reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)

reviews_datas = reviews.values
labels_datas = labels.values
chars = ["", " "]  # tokens to skip when building the vocabulary

def get_words(npll):
    words = []
    for ii in npll:
        for i in ii[0].split(" "):
            if i in chars:
                pass
            else:
                words.append(i)
    return words
words = get_words(reviews_datas)

vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>", 0]]
count.extend(vocab)
#print(count[:10])

word2id = {}
id2word = {}
for i, w in enumerate(count):
    word2id[w[0]] = i
    id2word[i] = w[0]
print(id2word[100], word2id['i'])

reviews_seq = [seq[0].split(" ") for seq in reviews_datas]

reviews_list = []
seq_len = 256
for seq in reviews_seq:
    l = [1]
    for s in seq:
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if(len(l)>=seq_len):
        l=l[:seq_len]
    while(len(l)<seq_len):
        l.append(0)
    reviews_list.append(l)

reviews_list = np.array(reviews_list)
labels_list = pd.get_dummies(labels).values


x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]

y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]



train_rate=0.0001 
train_step=50
batch_size=500
embed_size = 32
sequence_length = 256
n_classes = 2




h1_num = 32
h2_num = 16
h3_num = 2





# (-1, 256)
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")

embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
# (-1, 256) -> (-1, 256, 32)
x_1 = tf.nn.embedding_lookup(embeddings,x)



y=tf.placeholder(dtype=tf.float32,shape=[None,h3_num],name="expected_y")

weights={
    "h1":tf.Variable(tf.random_normal(shape=[h1_num,h2_num])),
    "h2":tf.Variable(tf.random_normal(shape=[h2_num,h3_num])),
    }
bias={
    "h1":tf.Variable(tf.fill([h2_num],0.1)),
    "h2":tf.Variable(tf.fill([h3_num],0.1})),def RNN(x,weights,bias) :
    First convert the input to the accepted shapes of dynamic_rnn: batch_size,sequence_length,frame_size
    rnn_cell=tf.nn.rnn_cell.BasicLSTMCell(h1_num)

    output,states=tf.nn.dynamic_rnn(rnn_cell,x,dtype=tf.float32)
    h = tf.matmul(output[:,-1,:],weights)+bias
    Output will be [batch_size,sequence_length, RNn_cell.output_size
    return (h)



h2 = tf.nn.relu(RNN(x_1,weights["h1"],bias["h1"]))

predy = tf.matmul(h2,weights["h2"])+bias["h2"]



cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,logits=predy))

opt=tf.train.AdamOptimizer().minimize(cost)

correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))

with tf.Session() as sess:
    saver = tf.train.Saver()
    srun = sess.run
    init =  tf.global_variables_initializer()
    srun(init)
    for e in range(train_step):
        for t in range(20000//batch_size):
            ts = int(t*batch_size)
            batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
            srun(opt,{x:batch_x,y:batch_y})
            if(t%1==0):  # log every batch
                accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
                print(e,t,cost_val,accuracy_val)
        saver.save(sess,'/Users/yss/YSSFiles/TFAPP/2RNN/txt_deal/saver/model',global_step=e)        
        accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
        print(e,cost_val,accuracy_val)

The output

...
49 37 0.21631543 0.92
49 38 0.21078381 0.924
49 39 0.36801508 0.854
49 0.82634455 0.7292
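Finally, to actually "read the mind" of a new review with the trained model, the text has to be encoded exactly the way the training data was. Below is a minimal inference sketch; it assumes the RNN graph above is still built, sess is an open session with trained (or restored) variables, word2id and seq_len are the preprocessing objects from the script, and the label columns follow the pd.get_dummies order (negative first, positive second). The example sentence is made up.

def encode_review(text):
    # Same encoding as training: start with 1, map unknown words to <PAD> (id 0),
    # then cut or pad the sequence to seq_len.
    ids = [1]
    for w in text.split(" "):
        ids.append(word2id.get(w, 0))
    ids = ids[:seq_len]
    ids = ids + [0] * (seq_len - len(ids))
    return np.array([ids])

probs = sess.run(tf.nn.softmax(predy), {x: encode_review("this movie was great")})
print(probs)  # [[p_negative, p_positive]]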