赞
踩
目录
利用循环神经网络进行新闻话题分类的代码,设置Embedding的trainable=True,并调整网络结构,看是否可以提高识别率。
经过不断的调参,得出下表结果,当设置Embedding的trainable=True时,并经过下表的参数调整,识别率有所下降。
SimpleRNN:
optimizer | lr | batch_ size | trainable | epochs | 验证集识别率 |
Adam | 0.001 | 512 | False | 10 | 0.4666 |
Adam | 0.001 | 512 | True | 10 | 0.4974 |
RMSprop | 0.001 | 512 | True | 10 | 0.5083 |
SGD | 0.001 | 512 | True | 10 | 0.2692 |
Lstm:
optimizer | lr | batch_ size | trainable | epochs | 验证集识别率 |
Adam | 0.001 | 512 | False | 10 | 0.5904 |
Adam | 0.001 | 512 | True | 10 | 0.5670 |
RMSprop | 0.001 | 512 | True | 10 | 0.5940 |
SGD | 0.001 | 512 | True | 10 | 0.2674 |
GRU:
optimizer | lr | batch_ size | trainable | epochs | 验证集识别率 |
Adam | 0.001 | 512 | False | 10 | 0.5973 |
Adam | 0.001 | 512 | True | 10 | 0.5518 |
RMSprop | 0.001 | 512 | True | 10 | 0.5916 |
SGD | 0.001 | 512 | True | 10 | 0.2800 |
网络结构:
优化器为Adam,trainable=True,各种循环神经网络的识别率:
SimpleRNN | |
LSTM | |
GRU | |
优化器为RMSprop,trainable=True,各种循环神经网络的识别率:
SimpleRNN | |
LSTM | |
GRU | |
优化器为SGD,trainable=True,各种循环神经网络的识别率:
SimpleRNN | |
LSTM | |
GRU | |
可见,SGD并不适合当做循环神经网络的优化器。
- # In[1]: 读取新闻话题分类数据
- import pandas as pd
- df = pd.read_json(r'D:\Cadabra_tools002\course_data\News_Category_Dataset.json', lines=True)
- df.head()
-
- # In[2]: 预处理,合并 "WORLDPOST"和"THE WORLDPOST"两种类别
- df.category = df.category.map(lambda x:"WORLDPOST" if x == "THE WORLDPOST" else x)
- categories = df.groupby('category')
- print("total categories: ", categories.ngroups)
- print(categories.size())
-
- # In[3]: 将单词进行标号
- from keras.preprocessing import sequence
- from keras.preprocessing.text import Tokenizer
-
- # 将标题和正文合并
- df['text'] = df.headline + " " + df.short_description
-
- # 将单词进行标号
- tokenizer = Tokenizer()
- tokenizer.fit_on_texts(df.text)
- X = tokenizer.texts_to_sequences(df.text)
- df['words'] = X
-
- #记录每条数据的单词数
- df['word_length'] = df.words.apply(lambda i: len(i))
-
- #清除单词数不足5个的数据条目
- df = df[df.word_length >= 5]
- df.word_length.describe()
-
- # In[4]: 将类别进行编号
- maxlen = 50
- X = list(sequence.pad_sequences(df.words, maxlen=maxlen))
-
- # 将类别进行编号,
- categories = df.groupby('category').size().index.tolist()
- category_int = {}
- int_category = {}
- for i, k in enumerate(categories):
- category_int.update({k:i}) # 类别 -> 编号
- int_category.update({i:k}) # 编号 -> 类别
-
- df['c2id'] = df['category'].apply(lambda x: category_int[x])
-
- # In[5]: 随机选取训练样本
- import numpy as np
- import keras.utils as utils
- from sklearn.model_selection import train_test_split
-
- X = np.array(X)
- Y = utils.to_categorical(list(df.c2id))
-
- # 将数据分成两部分,随机取80%用于训练,20%用于测试
- seed = 29 # 随机种子
- x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)
-
- # In[6]: 加载预先训练好的单词向量
- EMBEDDING_DIM = 100
- embeddings_index = {}
- f = open(r'D:\Cadabra_tools002\course_data\glove.6B.100d.txt',errors='ignore') # 每个单词用100个数字的向量表示
- for line in f:
- values = line.split()
- word = values[0]
- coefs = np.asarray(values[1:], dtype='float32')
- embeddings_index[word] = coefs
- f.close()
-
- print('Total %s word vectors.' %len(embeddings_index)) #399913
-
- # In[7]: 构造Embedding层,并用预训练好的单词向量初始化,注意该层不用训练
- from keras.initializers import Constant
- from keras.layers.embeddings import Embedding
-
- word_index = tokenizer.word_index
- embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
- for word, i in word_index.items():
- embedding_vector = embeddings_index.get(word)
- #根据单词挑选出对应向量
- if embedding_vector is not None:
- embedding_matrix[i] = embedding_vector
-
- embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM,
- embeddings_initializer=Constant(embedding_matrix),
- input_length = maxlen,
- trainable=True #可改为True
- )
-
- # In[8]: 构造神经网络并训练,对单词向量进行分类
- from keras import optimizers
- from keras.models import Sequential
- from keras.layers import Dense
- from keras.layers import Flatten
-
- model = Sequential()
- model.add(embedding_layer) # 注意该层的 trainable=False
- model.add(Flatten())
- model.add(Dense(64, activation='relu')) #
- #当结果是输出多个分类的概率时,用softmax激活函数,它将为30个分类提供不同的可能性概率值
- model.add(Dense(len(int_category), activation='softmax'))
-
- #对于输出多个分类结果,最好的损失函数是categorical_crossentropy
- optimizer = optimizers.Adam(lr=0.001)#调整识别率
- #optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
- model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
- model.summary()
-
- history = model.fit(x_train, y_train, epochs=3, validation_data=(x_val, y_val), batch_size=512)
- # val_accuracy: 0.4611
- # 识别率并不高,这是因为全连接网络不适合用于识别单词序列
-
- # In[9]: 绘制训练过程中识别率和损失的变化
- import matplotlib.pyplot as plt
- acc = history.history['acc']
- val_acc = history.history['val_acc']
- loss = history.history['loss']
- val_loss = history.history['val_loss']
- epochs = range(1, len(acc) + 1)
-
- plt.title('Training and validation accuracy')
- plt.plot(epochs, acc, 'red', label='Training acc')
- plt.plot(epochs, val_acc, 'blue', label='Validation acc')
- plt.legend()
-
- plt.figure()
- plt.title('Training and validation loss')
- plt.plot(epochs, loss, 'red', label='Training loss')
- plt.plot(epochs, val_loss, 'blue', label='Validation loss')
- plt.legend()
-
- plt.show()
-
- # In[10]: 使用循环神经网络 RNN 进行分类
- from keras.layers import SimpleRNN
- model = Sequential()
- model.add(embedding_layer)
- model.add(SimpleRNN(64)) #可改变单词向量的维度
- #model.add(Dense(32, activation='Relu'))
-
- model.add(Dense(len(int_category), activation='softmax'))
-
- optimizer = optimizers.Adam(lr=0.001)#调整识别率
- #optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
- #optimizer = optimizers.SGD(lr=0.001)#调整识别率
- model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
- history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val), batch_size=512)
- # val_accuracy: 0.4486
-
- # 绘制训练过程中识别率和损失的变化
- acc = history.history['acc']
- val_acc = history.history['val_acc']
- loss = history.history['loss']
- val_loss = history.history['val_loss']
- epochs = range(1, len(acc) + 1)
-
- plt.title('Training and validation accuracy')
- plt.plot(epochs, acc, 'red', label='Training acc')
- plt.plot(epochs, val_acc, 'blue', label='Validation acc')
- plt.legend()
- plt.show()
-
- # In[11]: long short term memory(长短期记忆网络)
- from keras.layers import LSTM
- model = Sequential()
- model.add(embedding_layer)
- model.add(LSTM(64,activation='relu',return_sequences=True)) #输出每一个句子,激活函数换成sigmoid,或者activation='relu'
- model.add(LSTM(32)) #再添加一个LSTM,识别率反而下降
- #model.add(Flatten()) #进行展平
- model.add(Dense(len(int_category), activation='softmax')) #
- #optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
- optimizer = optimizers.Adam(lr=0.001)#调整识别率
- #optimizer = optimizers.SGD(lr=0.001)#调整识别率
- model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
- history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val), batch_size=512)
- # val_acc: 0.5920
-
- # 绘制训练过程中识别率和损失的变化
- acc = history.history['acc']
- val_acc = history.history['val_acc']
- loss = history.history['loss']
- val_loss = history.history['val_loss']
- epochs = range(1, len(acc) + 1)
-
- plt.title('Training and validation accuracy')
- plt.plot(epochs, acc, 'red', label='Training acc')
- plt.plot(epochs, val_acc, 'blue', label='Validation acc')
- plt.legend()
- plt.show()
-
- # In[12]: Gate Recurrent Unit (门控循环单元)
- from keras.layers import GRU, Conv1D, MaxPooling1D
- model = Sequential()
- model.add(embedding_layer)
- #使用一维卷积网络切割输入数据,参数5表示每各个单词作为切割小段
- model.add(Conv1D(64, 5, activation='relu'))
- #参数3表示,上层传下来的数据中,从每3个数值中抽取最大值
- model.add(MaxPooling1D(3))
- #添加一个有记忆性的GRU层,其原理与LSTM相同,运行速度更快,准确率有所降低
- model.add(GRU(64, dropout=0.1))
- model.add(Dense(len(int_category), activation='softmax'))
- #optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
- optimizer = optimizers.Adam(lr=0.001)#调整识别率
- #optimizer = optimizers.SGD(lr=0.001)#调整识别率
- model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
- history = model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val), batch_size=512)
- # val_acc: 0.5741
-
- # 绘制训练过程中识别率和损失的变化
- acc = history.history['acc']
- val_acc = history.history['val_acc']
- loss = history.history['loss']
- val_loss = history.history['val_loss']
- epochs = range(1, len(acc) + 1)
-
- plt.title('Training and validation accuracy')
- plt.plot(epochs, acc, 'red', label='Training acc')
- plt.plot(epochs, val_acc, 'blue', label='Validation acc')
- plt.legend()
- plt.show()

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。