Use Word2Vec and RNN (LSTM) for text sentiment analysis (how to read the human mind)
With the help of
- DNN
- CNN
- Word2Vec
- RNN(LSTM)
If any of these are unfamiliar, review our previous posts. Below, the sentiment classifier is implemented three ways: with a fully connected network, a convolutional neural network, and a recurrent neural network. Code part: 1. Fully connected implementation
import os
os.environ["KMP_DUPLICATE_LIB_OK"] ="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras
reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)
reviews_datas = reviews.values
labels_datas = labels.values
chars = ["".""]
def get_words(npll):
    words = []
    for ii in npll:
        for i in ii[0].split(" "):
            if i in chars:
                pass
            else:
                words.append(i)
    return words
words = get_words(reviews_datas)
vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>".0]]
count.extend(vocab)
#print(count[:10])
word2id = {}
id2word = {}
for i, w in enumerate(count):
word2id[w[0]] = i
id2word[i] = w[0]
print(id2word[100], word2id['i'])
reviews_seq = [seq[0].split(" ") for seq in reviews_datas]
reviews_list = []
seq_len = 256
for seq in reviews_seq:
    l = [1]  # 1 marks the start of a review
    for s in seq:
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if len(l) >= seq_len:
        l = l[:seq_len]      # truncate long reviews
    while len(l) < seq_len:
        l.append(0)          # pad short reviews with the <PAD> id (0)
    reviews_list.append(l)
reviews_list = np.array(reviews_list)
labels_list = pd.get_dummies(labels).values
x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]
y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]
labels = np.argmax(labels_list,axis=1)
print(reviews_list[0],labels[0])
print(reviews_list[0], len(reviews_list))
train_data = reviews_list
train_labels = labels
train_rate=0.0001
train_step=20
batch_size=500
embed_size = 32
sequence_length = 256
n_classes = 2
h1_num = 32
h2_num = 16
h3_num = 2
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")
y = tf.placeholder(dtype=tf.float32,shape=[None, 2],name="expected_y")
print(y)
embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
x_1 = tf.nn.embedding_lookup(embeddings,x)
#(-1,256)-->(-1,256,32)
h1 = tf.keras.layers.GlobalAveragePooling1D()(x_1)
# (-1, 256, 32) -> (-1, 32)
weights2 = tf.Variable(tf.random_normal(shape=[h1_num,h2_num]))
bias2 = tf.Variable(tf.fill([h2_num],0.1))
# (-1, 32) -> (-1, 16)
h2 = tf.nn.relu(tf.matmul(h1,weights2)+bias2)
# (-1, 16) -> (-1, 2)
#y_ = tf.nn.softmax(tf.matmul(h3,weights4)+bias4)
weights3 = tf.Variable(tf.random_normal(shape=[h2_num,h3_num]))
bias3 = tf.Variable(tf.fill([h3_num],0.1))
predy = (tf.matmul(h2,weights3)+bias3)
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=predy))
opt=tf.train.AdamOptimizer().minimize(cost)
correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))
with tf.Session() as sess:
saver = tf.train.Saver()
srun = sess.run
init = tf.global_variables_initializer()
srun(init)
for e in range(train_step):
for t in range(20000//batch_size):
ts = int(t*batch_size)
batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
srun(opt,{x:batch_x,y:batch_y})
            if t % 1 == 0:
accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
print(e,t,cost_val,accuracy_val)
saver.save(sess,'./2RNN/3_1Word2Vec/txt/saver/model.ckpt',global_step=t)
accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
print(e,cost_val,accuracy_val)
The output
...
19 37 0.29615197 0.906
19 38 0.31939483 0.87
19 39 0.4328907 0.81
19 0.42973673 0.8094
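As a usage note, here is a minimal sketch (not from the original post) of restoring the checkpoint saved above and scoring reviews with the fully connected graph; it assumes the graph-building code above has already run in the same process, so x, predy, and x_val refer to the tensors and arrays defined there:
with tf.Session() as sess:
    saver = tf.train.Saver()
    # the directory passed to saver.save above also receives a "checkpoint" index file
    ckpt = tf.train.latest_checkpoint('./2RNN/3_1Word2Vec/txt/saver/')
    saver.restore(sess, ckpt)
    # class probabilities for the first few validation reviews;
    # column order follows pd.get_dummies on the labels (e.g. [negative, positive])
    probs = sess.run(tf.nn.softmax(predy), {x: x_val[:3]})
    print(probs)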
2. CNN implementation
import os
os.environ["KMP_DUPLICATE_LIB_OK"] ="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras
reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)
reviews_datas = reviews.values
labels_datas = labels.values
chars = ["".""]
def get_words(npll):
    words = []
    for ii in npll:
        for i in ii[0].split(" "):
            if i in chars:
                pass
            else:
                words.append(i)
    return words
words = get_words(reviews_datas)
vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>".0]]
count.extend(vocab)
#print(count[:10])
word2id = {}
id2word = {}
for i, w in enumerate(count):
word2id[w[0]] = i
id2word[i] = w[0]
print(id2word[100], word2id['i'])
reviews_seq = [seq[0].split(" ") for seq in reviews_datas]
reviews_list = []
seq_len = 256
for seq in reviews_seq:
    l = [1]  # 1 marks the start of a review
    for s in seq:
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if len(l) >= seq_len:
        l = l[:seq_len]      # truncate long reviews
    while len(l) < seq_len:
        l.append(0)          # pad short reviews with the <PAD> id (0)
    reviews_list.append(l)
reviews_list = np.array(reviews_list)
labels_list = pd.get_dummies(labels).values
x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]
y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]
train_rate=0.0001
train_step=50
batch_size=500
embed_size = 16
sequence_length = 256
n_classes = 2
h1_num = 32
h2_num = 16
h3_num = 2
# (-1, 256)
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")
embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
# (-1, 256) -># (-1, 256,32)
x_1 = tf.nn.embedding_lookup(embeddings,x)
y=tf.placeholder(dtype=tf.float32,shape=[None,h3_num],name="expected_y")
def CNN(x):
    # Convert the input to the 4-D shape expected by conv2d: (batch_size, sequence_length, embed_size, 1)
    # (-1, 256, 16) -> (-1, 256, 16, 1)
    x = tf.reshape(x, [-1, sequence_length, embed_size, 1])
    # (-1, 256, 16, 1) -> (-1, 128, 8, 1)
    pool0 = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # First layer: convolutional layer
    # (-1, 128, 8, 1) -> (-1, 128, 8, 32)
    conv1_weights = tf.get_variable("conv1_weights", [5, 5, 1, 32], initializer=tf.truncated_normal_initializer(stddev=0.1))  # input depth 1, filter depth 32
    conv1_biases = tf.get_variable("conv1_biases", [32], initializer=tf.constant_initializer(0.0))
    conv1 = tf.nn.conv2d(pool0, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')  # stride 1, zero ('SAME') padding
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))  # ReLU activation for non-linearity
    # Second layer: max pooling layer
    # 2x2 pooling window, stride 2, 'SAME' padding
    # (-1, 128, 8, 32) -> (-1, 64, 4, 32)
    pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Third layer: convolutional layer
    # (-1, 64, 4, 32) -> (-1, 64, 4, 64)
    conv2_weights = tf.get_variable("conv2_weights", [3, 3, 32, 64], initializer=tf.truncated_normal_initializer(stddev=0.1))  # input depth 32, filter depth 64
    conv2_biases = tf.get_variable("conv2_biases", [64], initializer=tf.constant_initializer(0.0))
    conv2 = tf.nn.conv2d(pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')  # stride 1, zero ('SAME') padding
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))
    # Fourth layer: max pooling layer
    # 2x2 pooling window, stride 2, 'SAME' padding
    # (-1, 64, 4, 64) -> (-1, 32, 2, 64)
    pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Fifth layer: fully connected layer
    fc1_weights = tf.get_variable("fc1_weights", [32 * 2 * 64, 256], initializer=tf.truncated_normal_initializer(stddev=0.1))  # flatten the 32*2*64 output of the previous layer into a feature vector
    fc1_baises = tf.get_variable("fc1_baises", [256], initializer=tf.constant_initializer(0.1))
    # (-1, 32, 2, 64) -> (-1, 32*2*64)
    pool2_vector = tf.reshape(pool2, [-1, 32 * 2 * 64])
    # (-1, 32*2*64) -> (-1, 256)
fc1 = tf.nn.relu(tf.matmul(pool2_vector, fc1_weights) + fc1_baises)
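    # (Sketch, not in the original code) dropout could be applied here to combat the
    # overfitting visible in the training output below; keep_prob would have to be
    # defined as a tf.placeholder and fed as e.g. 0.5 during training and 1.0 at
    # evaluation time:
    # fc1 = tf.nn.dropout(fc1, keep_prob)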
    fc2_weights = tf.get_variable("fc2_weights", [256, 2], initializer=tf.truncated_normal_initializer(stddev=0.1))  # map the 256-dim feature vector to 2 class logits
    fc2_baises = tf.get_variable("fc2_baises", [2], initializer=tf.constant_initializer(0.1))
    # (-1, 256) -> (-1, 2)
h2 = tf.matmul(fc1, fc2_weights) + fc2_baises
return (h2)
# To reduce overfitting, a dropout layer could be added inside CNN (see the commented sketch above)
predy = CNN(x_1)
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,logits=predy))
opt=tf.train.AdamOptimizer().minimize(cost)
correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))
with tf.Session() as sess:
saver = tf.train.Saver()
srun = sess.run
init = tf.global_variables_initializer()
srun(init)
for e in range(train_step):
for t in range(20000//batch_size):
ts = int(t*batch_size)
batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
srun(opt,{x:batch_x,y:batch_y})
            if t % 1 == 0:
accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
print(e,t,cost_val,accuracy_val)
saver.save(sess,'/Users/yss/YSSFiles/TFAPP/2RNN/txt_deal/saver/model',global_step=e)
accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
print(e,cost_val,accuracy_val)
The output
...
39 37 0.11854752 0.974
39 38 0.05035739 0.994
39 39 0.025472356 1.0
39 1.1234461 0.657
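Note that the CNN fits the training batches almost perfectly (accuracy reaching 1.0) while validation accuracy falls to about 0.66, a clear sign of overfitting; the commented dropout sketch inside CNN() above marks one place where regularization could be added.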
3. RNN (LSTM) implementation
import os
os.environ["KMP_DUPLICATE_LIB_OK"] ="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras
reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)
reviews_datas = reviews.values
labels_datas = labels.values
chars = ["".""]
def get_words(npll):
    words = []
    for ii in npll:
        for i in ii[0].split(" "):
            if i in chars:
                pass
            else:
                words.append(i)
    return words
words = get_words(reviews_datas)
vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>".0]]
count.extend(vocab)
#print(count[:10])
word2id = {}
id2word = {}
for i, w in enumerate(count):
word2id[w[0]] = i
id2word[i] = w[0]
print(id2word[100], word2id['i'])
reviews_seq = [seq[0].split(" ") for seq in reviews_datas]
reviews_list = []
seq_len = 256
for seq in reviews_seq:
    l = [1]  # 1 marks the start of a review
    for s in seq:
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if len(l) >= seq_len:
        l = l[:seq_len]      # truncate long reviews
    while len(l) < seq_len:
        l.append(0)          # pad short reviews with the <PAD> id (0)
    reviews_list.append(l)
reviews_list = np.array(reviews_list)
labels_list = pd.get_dummies(labels).values
x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]
y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]
train_rate=0.0001
train_step=50
batch_size=500
embed_size = 32
sequence_length = 256
n_classes = 2
h1_num = 32
h2_num = 16
h3_num = 2
# (-1, 256)
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")
embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
# (-1, 256) -># (-1, 256,32)
x_1 = tf.nn.embedding_lookup(embeddings,x)
y=tf.placeholder(dtype=tf.float32,shape=[None,h3_num],name="expected_y")
weights={
"h1":tf.Variable(tf.random_normal(shape=[h1_num,h2_num])),
"h2":tf.Variable(tf.random_normal(shape=[h2_num,h3_num])),
}
bias = {
    "h1": tf.Variable(tf.fill([h2_num], 0.1)),
    "h2": tf.Variable(tf.fill([h3_num], 0.1)),
}

def RNN(x, weights, bias):
    # dynamic_rnn expects input of shape (batch_size, sequence_length, frame_size);
    # x already has that shape after the embedding lookup
    rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(h1_num)
    # output has shape (batch_size, sequence_length, rnn_cell.output_size)
    output, states = tf.nn.dynamic_rnn(rnn_cell, x, dtype=tf.float32)
    # keep only the last time step and project it with the given weights
    h = tf.matmul(output[:, -1, :], weights) + bias
    return h
h2 = tf.nn.relu(RNN(x_1,weights["h1"],bias["h1"]))
predy = tf.matmul(h2,weights["h2"])+bias["h2"]
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,logits=predy))
opt=tf.train.AdamOptimizer().minimize(cost)
correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))
with tf.Session() as sess:
saver = tf.train.Saver()
srun = sess.run
init = tf.global_variables_initializer()
srun(init)
for e in range(train_step):
for t in range(20000//batch_size):
ts = int(t*batch_size)
batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
srun(opt,{x:batch_x,y:batch_y})
            if t % 1 == 0:
accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
print(e,t,cost_val,accuracy_val)
saver.save(sess,'/Users/yss/YSSFiles/TFAPP/2RNN/txt_deal/saver/model',global_step=e)
accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
print(e,cost_val,accuracy_val)
The output
...
49 37 0.21631543 0.92
49 38 0.21078381 0.924
49 39 0.36801508 0.854
49 0.82634455 0.7292
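For comparison, on these runs the final validation accuracies are roughly 0.81 for the fully connected model, 0.66 for the CNN, and 0.73 for the RNN (LSTM).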