- import tensorflow as tf
- from tensorflow.examples.tutorials.mnist import input_data  # leftover from the MNIST tutorial, not used below
- import nltk
- import pandas as pd
- from collections import Counter
- import numpy as np
- MAX_FEATURES = 150
- MAX_SENTENCE_LENGTH = 100
-
- # hyperparameters
- lr = 0.001
- training_iters = 100000
- batch_size = 127
- vocab_size = 200
- embedding_size = 100
- n_inputs = embedding_size # each time step receives one word-embedding vector
- n_steps = MAX_SENTENCE_LENGTH # time steps = padded sentence length
- n_hidden_units = 128 # neurons in hidden layer
- n_classes = 2 # binary sentiment classes
-
- def get_sentiment_data():
-     df_sentiment = pd.read_csv('sentiment.csv', encoding='utf-8')
-     sentences = df_sentiment['sentence'].values
-     sentences = [s.lower() for s in sentences]
-     wordlist_sentence = [nltk.word_tokenize(s) for s in sentences]
-     ws = []
-     for wordlist in wordlist_sentence:
-         ws.extend(wordlist)
-     word_counter = Counter(ws)
-     print(word_counter.most_common(100))
-     # note: this only assigns a local variable; the global vocab_size (200) used for
-     # the embedding matrix is unchanged, and it must stay >= MAX_FEATURES + 2
-     vocab_size = min(MAX_FEATURES, len(word_counter)) + 2
-     word2index = {x[0]: i + 2 for i, x in
-                   enumerate(word_counter.most_common(MAX_FEATURES))}
-     word2index["PAD"] = 0
-     word2index["UNK"] = 1
-     index2word = {v: k for k, v in word2index.items()}
-     res = []
-     for line in df_sentiment.iterrows():
-         label, sentence = str(line[1]['label']), line[1]['sentence']
-         words = nltk.word_tokenize(sentence.lower())
-         seqs1 = []
-         for word in words:
-             if word in word2index:
-                 seqs1.append(word2index[word])
-             else:
-                 seqs1.append(word2index["UNK"])
-         # skip sentences that do not fit into MAX_SENTENCE_LENGTH
-         if MAX_SENTENCE_LENGTH < len(seqs1):
-             print('sentence longer than MAX_SENTENCE_LENGTH, skipping', len(seqs1))
-             continue
-         # left-pad with the PAD id (0) so every sequence has the same length
-         padding = [0] * (MAX_SENTENCE_LENGTH - len(seqs1))
-         padding.extend(seqs1)
-         if len(padding) != MAX_SENTENCE_LENGTH:
-             print('unexpected length of padding', len(padding), padding)
-         if label == '0':
-             res.append([np.array([1, 0]), padding])
-         if label == '1':
-             res.append([np.array([0, 1]), padding])
-     return res
-
- # set random seed for comparing the two result calculations
- tf.set_random_seed(1)
-
- # leftover from the MNIST RNN tutorial this script was adapted from; the sentiment
- # data loaded below is what is actually used, so the MNIST download can be skipped
- # mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
-
- # tf Graph input
- x = tf.placeholder(tf.int32, [None, n_steps])
-
- W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),name = "W")
- embedded_chars = tf.nn.embedding_lookup(W, x)
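- # embedded_chars has shape (batch_size, n_steps, embedding_size) and is fed
- # straight into dynamic_rnn, so no manual input reshaping is needed here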
- #embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
-
-
- y = tf.placeholder(tf.float32, [None, n_classes])
-
- # Define weights
- weights = {
-     # (embedding_size, n_hidden_units) -- not used below, kept from the MNIST template
-     'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units])),
-     # (n_hidden_units, n_classes)
-     'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]))
- }
- biases = {
-     # (n_hidden_units, ) -- not used below, kept from the MNIST template
-     'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ])),
-     # (n_classes, )
-     'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]))
- }
-
-
- def RNN(X, weights, biases):
-     # hidden layer for input to cell
-     ########################################
-     # The MNIST-style input projection is not needed here, because X already
-     # arrives as word embeddings of shape (batch, n_steps, embedding_size):
-     # X = tf.reshape(X, [-1, n_inputs])
-     # X_in = tf.matmul(X, weights['in']) + biases['in']
-     # X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_units])
-
-     # cell
-     ##########################################
-     # basic LSTM Cell.
-     if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
-         cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_units, forget_bias=1.0, state_is_tuple=True)
-     else:
-         cell = tf.contrib.rnn.BasicLSTMCell(n_hidden_units)
-     # lstm cell state is divided into two parts (c_state, h_state)
-     init_state = cell.zero_state(batch_size, dtype=tf.float32)
-
-     # You have 2 options for the following step:
-     # 1: tf.nn.rnn(cell, inputs);
-     # 2: tf.nn.dynamic_rnn(cell, inputs).
-     # Option 1 requires reshaping the input, see:
-     # https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py
-     # Here we use option 2. dynamic_rnn accepts a Tensor of shape (batch, steps, inputs)
-     # or (steps, batch, inputs); make sure time_major is set accordingly.
-     outputs, final_state = tf.nn.dynamic_rnn(cell, X, initial_state=init_state, time_major=False)
-
-     # hidden layer for output as the final results
-     #############################################
-     # results = tf.matmul(final_state[1], weights['out']) + biases['out']
-     # or: unpack to a list [(batch, outputs)..] * steps and take the last step
-     if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
-         outputs = tf.unpack(tf.transpose(outputs, [1, 0, 2]))  # states is the last outputs
-     else:
-         outputs = tf.unstack(tf.transpose(outputs, [1, 0, 2]))
-     results = tf.matmul(outputs[-1], weights['out']) + biases['out']  # shape = (batch_size, n_classes)
-     return results
-
- # alternative synthetic dataset (not used below): even rows get small random ids
- # and label [1, 0], odd rows get shifted ids and label [0, 1]
- def generate_number_classification():
-     import random
-     number = training_iters
-     data = []
-     for i in range(number):
-         number_list = []
-         for j in range(MAX_SENTENCE_LENGTH):
-             number_list.append(random.randint(0, MAX_FEATURES))
-         data.append(number_list)
-     res = []
-     for i, number in enumerate(data):
-         if i % 2 == 0:
-             question = [str(n) for n in number]
-             res.append([[1, 0], question])
-         if i % 2 == 1:
-             question = [str(n + 30) for n in number]
-             res.append([[0, 1], question])
-     return res
-
- data = get_sentiment_data()
- training_iters = len(data)
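- # from here on, training_iters means "number of available labelled examples";
- # the batches below consume the data in file order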
- pred = RNN(embedded_chars, weights, biases)
- cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
- train_op = tf.train.AdamOptimizer(lr).minimize(cost)
- correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
- accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
-
- def get_batch(data, step, batch_size):
-     data = data[step * batch_size:(step + 1) * batch_size]
-     return [u[1] for u in data], [u[0] for u in data]
-
- with tf.Session() as sess:
-     # tf.initialize_all_variables() is no longer valid as of
-     # 2017-03-02 if using tensorflow >= 0.12
-     if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
-         init = tf.initialize_all_variables()
-     else:
-         init = tf.global_variables_initializer()
-     sess.run(init)
-     step = 0
-     while ((step + 2) * batch_size) < training_iters:
-         # batch `step` is used for training, batch `step + 1` for evaluation
-         batch_xs, batch_ys = get_batch(data, step, batch_size)
-         batch_xs2, batch_ys2 = get_batch(data, step + 1, batch_size)
-         sess.run([train_op], feed_dict={
-             x: batch_xs,
-             y: batch_ys,
-         })
-         if step % 2 == 0:
-             print(step * batch_size, sess.run(accuracy, feed_dict={
-                 x: batch_xs2,
-                 y: batch_ys2,
-             }))
-         step += 1

This post differs only a little from the previous one; the main change is that real data is used. The data format is:
[0, 1] or [1, 0] is y, the two-class label;
[id, id, id, ..., id] is x, the sentence as a sequence of word ids.
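For concreteness, one training example in this format might look like the following minimal sketch (the ids are made-up values and the sequence is shortened; real sequences are MAX_SENTENCE_LENGTH long):
- # hypothetical single training example: one-hot label plus left-padded id sequence
- y_example = np.array([1, 0])                # label 0; [0, 1] would be label 1
- x_example = [0, 0, 0, 14, 7, 92, 3, 51]     # PAD ids (0) followed by word ids
- sample = [y_example, x_example]             # same layout as the elements of res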
The data-processing code works like this:
First build a list of all words (duplicates included), then feed it to Counter to get a frequency dict directly.
Then iterate over the DataFrame again and tokenize each sentence with nltk. (There also seems to be a helper that does the counting and the id conversion in one step, though I forget which function it is; a sketch of one such option follows the code below.)
Once you have the words, convert them into an id sequence,
convert y into the format above,
append the pair to res,
and simply return res.
-
- def get_sentiment_data():
-     df_sentiment = pd.read_csv('sentiment.csv', encoding='utf-8')
-     sentences = df_sentiment['sentence'].values
-     sentences = [s.lower() for s in sentences]
-     wordlist_sentence = [nltk.word_tokenize(s) for s in sentences]
-     ws = []
-     for wordlist in wordlist_sentence:
-         ws.extend(wordlist)
-     word_counter = Counter(ws)
-     print(word_counter.most_common(100))
-     # note: this only assigns a local variable; the global vocab_size (200) used for
-     # the embedding matrix is unchanged, and it must stay >= MAX_FEATURES + 2
-     vocab_size = min(MAX_FEATURES, len(word_counter)) + 2
-     word2index = {x[0]: i + 2 for i, x in
-                   enumerate(word_counter.most_common(MAX_FEATURES))}
-     word2index["PAD"] = 0
-     word2index["UNK"] = 1
-     index2word = {v: k for k, v in word2index.items()}
-     res = []
-     for line in df_sentiment.iterrows():
-         label, sentence = str(line[1]['label']), line[1]['sentence']
-         words = nltk.word_tokenize(sentence.lower())
-         seqs1 = []
-         for word in words:
-             if word in word2index:
-                 seqs1.append(word2index[word])
-             else:
-                 seqs1.append(word2index["UNK"])
-         # skip sentences that do not fit into MAX_SENTENCE_LENGTH
-         if MAX_SENTENCE_LENGTH < len(seqs1):
-             print('sentence longer than MAX_SENTENCE_LENGTH, skipping', len(seqs1))
-             continue
-         # left-pad with the PAD id (0) so every sequence has the same length
-         padding = [0] * (MAX_SENTENCE_LENGTH - len(seqs1))
-         padding.extend(seqs1)
-         if len(padding) != MAX_SENTENCE_LENGTH:
-             print('unexpected length of padding', len(padding), padding)
-         if label == '0':
-             res.append([np.array([1, 0]), padding])
-         if label == '1':
-             res.append([np.array([0, 1]), padding])
-     return res
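About the one-step helper mentioned above: I am not sure which function the post had in mind, but one reasonable option (an assumption on my part, not what this code uses) is the Keras text preprocessing utilities, roughly like this, where sentences is the lowercased sentence list from the function above:
- # minimal sketch assuming Keras is installed (oov_token needs a fairly recent version)
- from keras.preprocessing.text import Tokenizer
- from keras.preprocessing.sequence import pad_sequences
-
- tok = Tokenizer(num_words=MAX_FEATURES, oov_token="UNK")
- tok.fit_on_texts(sentences)                       # counts words, assigns ids by frequency
- seqs = tok.texts_to_sequences(sentences)          # sentences -> id sequences
- x_ids = pad_sequences(seqs, maxlen=MAX_SENTENCE_LENGTH)  # left-pads with 0
Unlike the loop above, pad_sequences truncates sequences longer than maxlen instead of skipping those sentences.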

The data is available in my resource downloads, named the sentiment-analysis dataset; just rename the train.csv inside it (the code above reads it as sentiment.csv).
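For reference, get_sentiment_data() only relies on two columns, label (0 or 1) and sentence; a hypothetical pair of rows:
- label,sentence
- 1,this movie was great
- 0,terrible plot and boring acting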