
TensorFlow Sentiment Analysis: Sentence Classification with an RNN (Data Included)

import tensorflow as tf
import nltk
import pandas as pd
from collections import Counter
import numpy as np

MAX_FEATURES = 150         # keep only the 150 most frequent words
MAX_SENTENCE_LENGTH = 100  # pad (or skip) sentences to this length

# hyperparameters
lr = 0.001
training_iters = 100000    # overwritten below with the real dataset size
batch_size = 127
vocab_size = 200           # must be >= MAX_FEATURES + 2 (for PAD and UNK)
embedding_size = 100

n_inputs = embedding_size      # each time step consumes one word embedding
n_steps = MAX_SENTENCE_LENGTH  # time steps = padded sentence length
n_hidden_units = 128           # neurons in the LSTM hidden layer
n_classes = 2                  # binary sentiment labels

def get_sentiment_data():
    df_sentiment = pd.read_csv('sentiment.csv', encoding='utf-8')
    sentences = df_sentiment['sentence'].values
    sentences = [s.lower() for s in sentences]
    wordlist_sentence = [nltk.word_tokenize(s) for s in sentences]
    ws = []
    for wordlist in wordlist_sentence:
        ws.extend(wordlist)
    word_counter = Counter(ws)
    mc = word_counter.most_common(100)
    print(mc)  # peek at the most frequent words
    # local only; the global vocab_size = 200 is what sizes the embedding
    vocab_size = min(MAX_FEATURES, len(word_counter)) + 2
    # ids 0 and 1 are reserved for PAD and UNK
    word2index = {x[0]: i + 2 for i, x in
                  enumerate(word_counter.most_common(MAX_FEATURES))}
    word2index["PAD"] = 0
    word2index["UNK"] = 1
    index2word = {v: k for k, v in word2index.items()}
    res = []
    for line in df_sentiment.iterrows():
        label, sentence = str(line[1]['label']), line[1]['sentence']
        words = nltk.word_tokenize(sentence.lower())
        seqs1 = []
        for word in words:
            if word in word2index:
                seqs1.append(word2index[word])
            else:
                seqs1.append(word2index["UNK"])
        if MAX_SENTENCE_LENGTH < len(seqs1):
            print('sentence too long, skipping', len(seqs1))
            continue
        # left-pad the id sequence with PAD (0) to MAX_SENTENCE_LENGTH
        padding = [0] * (MAX_SENTENCE_LENGTH - len(seqs1))
        padding.extend(seqs1)
        if len(padding) != MAX_SENTENCE_LENGTH:
            print('unexpected length of padding', len(padding), padding)
        if label == '0':
            res.append([np.array([1, 0]), padding])
        if label == '1':
            res.append([np.array([0, 1]), padding])
    return res

# set random seed for comparing the two result calculations
tf.set_random_seed(1)

# tf Graph input: padded word-id sequences and one-hot labels
x = tf.placeholder(tf.int32, [None, n_steps])
W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
embedded_chars = tf.nn.embedding_lookup(W, x)  # (batch, n_steps, embedding_size)
y = tf.placeholder(tf.float32, [None, n_classes])

# Define weights ('in' is unused here; kept from the original template)
weights = {
    # (embedding_size, 128)
    'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units])),
    # (128, 2)
    'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]))
}
biases = {
    # (128, )
    'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ])),
    # (2, )
    'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]))
}

def RNN(X, weights, biases):
    # The template's input projection (X -> weights['in']) is skipped here:
    # the word embeddings are fed to the LSTM directly.
    # basic LSTM cell; the class moved between old TF versions
    if int(tf.__version__.split('.')[1]) < 12 and int(tf.__version__.split('.')[0]) < 1:
        cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_units, forget_bias=1.0, state_is_tuple=True)
    else:
        cell = tf.contrib.rnn.BasicLSTMCell(n_hidden_units)
    # the LSTM state is a tuple (c_state, h_state)
    init_state = cell.zero_state(batch_size, dtype=tf.float32)
    # dynamic_rnn takes a Tensor shaped (batch, steps, inputs) when time_major=False.
    # The alternative, tf.nn.rnn(cell, inputs), needs X reshaped into a per-step list; see
    # https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py
    outputs, final_state = tf.nn.dynamic_rnn(cell, X, initial_state=init_state, time_major=False)
    # unpack to a list of [(batch, outputs)] * steps and classify on the last step
    if int(tf.__version__.split('.')[1]) < 12 and int(tf.__version__.split('.')[0]) < 1:
        outputs = tf.unpack(tf.transpose(outputs, [1, 0, 2]))
    else:
        outputs = tf.unstack(tf.transpose(outputs, [1, 0, 2]))
    results = tf.matmul(outputs[-1], weights['out']) + biases['out']  # (batch, n_classes)
    return results

def generate_number_classification():
    # synthetic alternative dataset: even rows are random sequences labelled [1, 0],
    # odd rows are the same kind of sequences shifted by 30 and labelled [0, 1]
    import random
    data = []
    for i in range(training_iters):
        number_list = [random.randint(0, MAX_FEATURES) for _ in range(MAX_SENTENCE_LENGTH)]
        data.append(number_list)
    res = []
    for i, number in enumerate(data):
        if i % 2 == 0:
            res.append([[1, 0], [str(n) for n in number]])
        if i % 2 == 1:
            res.append([[0, 1], [str(n + 30) for n in number]])
    return res

data = get_sentiment_data()
training_iters = len(data)

pred = RNN(embedded_chars, weights, biases)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
train_op = tf.train.AdamOptimizer(lr).minimize(cost)

correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

def get_batch(data, step, batch_size):
    batch = data[step * batch_size:(step + 1) * batch_size]
    return [u[1] for u in batch], [u[0] for u in batch]

with tf.Session() as sess:
    # tf.initialize_all_variables() is no longer valid for tensorflow >= 0.12
    if int(tf.__version__.split('.')[1]) < 12 and int(tf.__version__.split('.')[0]) < 1:
        init = tf.initialize_all_variables()
    else:
        init = tf.global_variables_initializer()
    sess.run(init)
    step = 0
    # train on batch `step`, report accuracy on the next (held-out) batch
    while ((step + 2) * batch_size) < training_iters:
        batch_xs, batch_ys = get_batch(data, step, batch_size)
        batch_xs2, batch_ys2 = get_batch(data, step + 1, batch_size)
        sess.run([train_op], feed_dict={
            x: batch_xs,
            y: batch_ys,
        })
        if step % 2 == 0:
            print(step * batch_size, sess.run(accuracy, feed_dict={
                x: batch_xs2,
                y: batch_ys2,
            }))
        step += 1

This post differs little from the previous one; the main change is that it trains on real data, in the following format:

[0, 1] or [1, 0] is y, the two-class label.

[id, id, id, ..., id] is x, the sentence as a sequence of word ids.
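For concreteness, here is what one stored training sample would look like (hypothetical word ids, with MAX_SENTENCE_LENGTH shortened to 6 for readability):

import numpy as np

# one element of `res`: [one-hot label, left-padded id sequence]
y_sample = np.array([0, 1])      # label 1, i.e. positive
x_sample = [0, 0, 0, 12, 45, 3]  # "PAD PAD PAD" followed by three word ids
sample = [y_sample, x_sample]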

 

The data-processing code works like this:

First, build a list of all the words (duplicates included), then feed it to Counter to get a frequency dict directly.
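A quick illustration of the Counter step on a toy word list (not from the real dataset):

from collections import Counter

ws = ['good', 'movie', 'good', 'plot']
word_counter = Counter(ws)          # Counter({'good': 2, 'movie': 1, 'plot': 1})
print(word_counter.most_common(2))  # [('good', 2), ('movie', 1)]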

Then iterate over the DataFrame again and tokenize each sentence with nltk's word_tokenize. (Apparently there is also a single utility that does the counting and the id conversion in one go; I forget which function it was — see the sketch below.)
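One likely candidate for that all-in-one utility is the Keras Tokenizer: fit_on_texts builds the word counts and texts_to_sequences does the word-to-id mapping from a single object. This is only a guess at what the author half-remembered; a minimal sketch, assuming a TF build that ships tf.keras (oov_token needs a reasonably recent Keras):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_FEATURES = 150
MAX_SENTENCE_LENGTH = 100
sentences = ['this movie was great', 'terrible plot and worse acting']

# fit_on_texts counts word frequencies; texts_to_sequences maps words to ids
tokenizer = Tokenizer(num_words=MAX_FEATURES + 2, oov_token='UNK', lower=True)
tokenizer.fit_on_texts(sentences)
seqs = tokenizer.texts_to_sequences(sentences)
# pad_sequences left-pads with 0 by default, matching the article's padding
padded = pad_sequences(seqs, maxlen=MAX_SENTENCE_LENGTH)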

Once the words are obtained, each one is converted to its id, yielding an id sequence.
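The conversion is just a dictionary lookup with an UNK fallback; an equivalent, more compact form of the loop in the code uses dict.get (toy vocabulary below):

word2index = {'PAD': 0, 'UNK': 1, 'good': 2, 'movie': 3}
words = ['good', 'movie', 'amazing']
seq = [word2index.get(w, word2index['UNK']) for w in words]  # [2, 3, 1]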

 

y is converted into the one-hot format shown above.

Each label/sequence pair is put into res,

and the function simply returns res.
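Downstream, get_batch splits a slice of res back into the x and y feeds; with a toy res of two samples (sequences shortened to length 4) it behaves like this:

res = [[[1, 0], [0, 0, 5, 9]],
       [[0, 1], [0, 7, 2, 4]]]
batch_xs = [u[1] for u in res]  # id sequences, fed to placeholder x
batch_ys = [u[0] for u in res]  # one-hot labels, fed to placeholder y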

def get_sentiment_data():
    df_sentiment = pd.read_csv('sentiment.csv', encoding='utf-8')
    sentences = df_sentiment['sentence'].values
    sentences = [s.lower() for s in sentences]
    wordlist_sentence = [nltk.word_tokenize(s) for s in sentences]
    ws = []
    for wordlist in wordlist_sentence:
        ws.extend(wordlist)
    word_counter = Counter(ws)
    mc = word_counter.most_common(100)
    print(mc)  # peek at the most frequent words
    # local only; the global vocab_size = 200 is what sizes the embedding
    vocab_size = min(MAX_FEATURES, len(word_counter)) + 2
    # ids 0 and 1 are reserved for PAD and UNK
    word2index = {x[0]: i + 2 for i, x in
                  enumerate(word_counter.most_common(MAX_FEATURES))}
    word2index["PAD"] = 0
    word2index["UNK"] = 1
    index2word = {v: k for k, v in word2index.items()}
    res = []
    for line in df_sentiment.iterrows():
        label, sentence = str(line[1]['label']), line[1]['sentence']
        words = nltk.word_tokenize(sentence.lower())
        seqs1 = []
        for word in words:
            if word in word2index:
                seqs1.append(word2index[word])
            else:
                seqs1.append(word2index["UNK"])
        if MAX_SENTENCE_LENGTH < len(seqs1):
            print('sentence too long, skipping', len(seqs1))
            continue
        # left-pad the id sequence with PAD (0) to MAX_SENTENCE_LENGTH
        padding = [0] * (MAX_SENTENCE_LENGTH - len(seqs1))
        padding.extend(seqs1)
        if len(padding) != MAX_SENTENCE_LENGTH:
            print('unexpected length of padding', len(padding), padding)
        if label == '0':
            res.append([np.array([1, 0]), padding])
        if label == '1':
            res.append([np.array([0, 1]), padding])
    return res

The data is in my resource downloads, under the name "sentiment analysis data"; just rename the train.csv inside it (the code reads it as sentiment.csv).
