
Text Classification: Word2vec + LSTM


Contents

Part 1: LSTM Text Classification in Practice

1. Data Loading and Preprocessing

2. Encoding Text as Integer Sequences

3. Embedding + LSTM (with LSTM Parameter Notes)

4. Word2vec + Bidirectional LSTM

Part 2: Key Takeaways


        LSTM (Long Short-Term Memory) is a special kind of recurrent neural network (RNN) designed for modeling and predicting sequential and time-series data. Compared with a vanilla RNN, an LSTM adds gating mechanisms that let it capture and remember long-range dependencies much more effectively.

        The key idea of the LSTM is to achieve long-term memory by controlling how information flows and is forgotten. The network is built from units called memory cells; each memory cell maintains an internal cell state and has three gates: an input gate, a forget gate, and an output gate.
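        To make the gating concrete, here is a schematic numpy sketch of a single LSTM step. This is not Keras's actual implementation; the stacked weight layout and the names are illustrative assumptions.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W, U, b):
    # W: (4h, d), U: (4h, h), b: (4h,) -- the four gate blocks stacked as
    # [input gate, forget gate, cell candidate, output gate]
    h = h_prev.shape[0]
    z = W @ x_t + U @ h_prev + b
    i = sigmoid(z[0*h:1*h])     # input gate: how much new information to write
    f = sigmoid(z[1*h:2*h])     # forget gate: how much of the old cell state to keep
    g = np.tanh(z[2*h:3*h])     # candidate values for the cell state
    o = sigmoid(z[3*h:4*h])     # output gate: how much of the cell state to expose
    c_t = f * c_prev + i * g    # new cell state: partly remembered, partly rewritten
    h_t = o * np.tanh(c_t)      # new hidden state, the layer's output at this step
    return h_t, c_t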

Part 1: LSTM Text Classification in Practice

1. Data Loading and Preprocessing

# Imports (unused ones from the original listing dropped)
import re
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers

# Load the data and keep only the two sources we want to classify
data = pd.read_excel('Inshorts Cleaned Data.xlsx')

def data_preprocess(data):
    df = data.drop(['Publish Date', 'Time ', 'Headline'], axis=1).copy()
    df.rename(columns={'Source ': 'Source'}, inplace=True)
    df = df[df.Source.isin(['YouTube', 'India Today'])].reset_index(drop=True)
    df['y'] = np.where(df.Source == 'YouTube', 1, 0)  # binary label
    df = df.drop(['Source'], axis=1)
    return df

df = data.pipe(data_preprocess)
print(df.shape)
df.head()

# English stop words
from nltk.corpus import stopwords
stop_english = stopwords.words('english')

# Text preprocessing: expand contractions, lowercase, drop punctuation and
# stop words, lemmatize
from nltk.stem import WordNetLemmatizer

def replace_abbreviation(text):
    rep_list = [
        ("it's", "it is"),
        ("i'm", "i am"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("we're", "we are"),
        ("they're", "they are"),
        ("you're", "you are"),
        ("that's", "that is"),
        ("this's", "this is"),
        ("can't", "can not"),
        ("don't", "do not"),
        ("doesn't", "does not"),
        ("we've", "we have"),
        ("i've", "i have"),
        ("isn't", "is not"),
        ("won't", "will not"),
        ("hasn't", "has not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("let's", "let us"),
        ("didn't", "did not"),
        ("hadn't", "had not"),
        ("what's", "what is"),
        ("couldn't", "could not"),
        ("you'll", "you will"),
        ("i'll", "i will"),
        ("you've", "you have"),
    ]
    result = text.lower()
    for word_replace in rep_list:
        result = result.replace(word_replace[0], word_replace[1])
    return result

def drop_char(text):
    result = text.lower()
    result = re.sub(r'[^\w\s]', ' ', result)  # drop punctuation and special characters
    result = re.sub(r'\s+', ' ', result)      # collapse repeated whitespace
    return result

def stemed_words(text, stop_words, lemma):
    word_list = [lemma.lemmatize(word, pos='v') for word in text.split() if word not in stop_words]
    return " ".join(word_list)

def text_preprocess(text_seq):
    stop_words = stopwords.words("english")
    lemma = WordNetLemmatizer()
    result = []
    for text in text_seq:
        if pd.isnull(text):
            result.append(None)
            continue
        text = replace_abbreviation(text)
        text = drop_char(text)
        text = stemed_words(text, stop_words, lemma)
        result.append(text)
    return result

df['short'] = text_preprocess(df.Short)
df[['Short', 'short']]

# Train/test split: hold out 2,000 random rows as the test set
test_index = list(df.sample(2000).index)
df['label'] = np.where(df.index.isin(test_index), 'test', 'train')
df['label'].value_counts()
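        A quick sanity check of the pipeline on a made-up sentence (the exact output can vary with your NLTK data files, but it should look roughly like this):

sample = "He's watching the movies, and they're laughing!"
print(text_preprocess([sample]))
# roughly: ['watch movies laugh'] -- contractions expanded, punctuation and
# stop words dropped, verbs lemmatized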

2. Encoding Text as Integer Sequences

        Sort words by frequency, build a dictionary of the 6,000 most frequent ones, and use it to encode each text as a fixed-length integer sequence.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences  # missing from the original listing

def word_dict_fit(train_text_list, num_words):
    '''
    train_text_list: ['some thing today', 'some thing today2']
    '''
    tok_params = {
        'num_words': num_words,  # dictionary size: keep only the num_words most frequent words
        'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        'lower': True,
        'split': ' ',
        'char_level': False,
        'oov_token': None,  # code assigned to out-of-dictionary words
    }
    tok = Tokenizer(**tok_params)  # tokenizer
    tok.fit_on_texts(train_text_list)
    return tok

def word_dict_apply_sequences(tok_model, text_list, len_vec):
    '''
    text_list: ['some thing today', 'some thing today2']
    '''
    list_tok = tok_model.texts_to_sequences(text_list)  # map words to integer codes
    pad_params = {
        'sequences': list_tok,
        'maxlen': len_vec,    # length of each padded vector
        'padding': 'pre',     # 'pre' or 'post': pad at the front or the back
        'truncating': 'pre',  # 'pre' or 'post': truncate overlong sequences at the front or the back
        'value': 0,           # pad with zeros
    }
    seq_tok = pad_sequences(**pad_params)  # returns a 2D array of padded code vectors
    return seq_tok

num_words, len_vec = 6000, 40
tok_model = word_dict_fit(df[df.label == 'train'].short, num_words)
tok_train = word_dict_apply_sequences(tok_model, df[df.label == 'train'].short, len_vec)
tok_test = word_dict_apply_sequences(tok_model, df[df.label == 'test'].short, len_vec)
tok_test
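        On a toy corpus the encoding looks like this (indices are assigned by descending word frequency in the fitted texts):

toy_tok = word_dict_fit(['cats eat fish', 'dogs eat meat', 'cats sleep'], num_words=10)
print(toy_tok.word_index)
# {'cats': 1, 'eat': 2, 'fish': 3, 'dogs': 4, 'meat': 5, 'sleep': 6}
print(word_dict_apply_sequences(toy_tok, ['cats eat fish today'], len_vec=6))
# [[0 0 0 1 2 3]] -- 'today' is not in the dictionary and is dropped; zeros pad the front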

3. Embedding + LSTM (with LSTM Parameter Notes)

        An LSTM layer takes a 3D tensor of shape (batch_size, timesteps, input_dim), so its input can be a time series or the embedding of a text sequence. With return_sequences=False, the layer outputs a 2D tensor of shape (batch_size, units).
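        A quick shape check matching the setup above (6,000-word dictionary, sequences of length 40):

from tensorflow.keras import Sequential, layers

m = Sequential([
    layers.Embedding(input_dim=6000, output_dim=128),  # -> (batch, timesteps, 128)
    layers.LSTM(32, return_sequences=False),           # -> (batch, 32)
])
m.build(input_shape=(None, 40))
print(m.output_shape)  # (None, 32); with return_sequences=True it would be (None, 40, 32)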

'''
Core LSTM layer parameters
units: output dimension
activation: activation function
recurrent_activation: activation used for the recurrent step
use_bias: boolean, whether to use a bias term
dropout: float in [0, 1], fraction of input units to drop
recurrent_dropout: float in [0, 1], fraction of recurrent-state units to drop
return_sequences: if True, return the full output sequence (3D); if False, only the last output (2D)
'''
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout

def init_lstm_model(max_features, embed_size):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=embed_size))
    model.add(Bidirectional(LSTM(units=32, activation='relu', recurrent_dropout=0.1)))
    model.add(Dropout(0.25, seed=1))
    model.add(Dense(64))
    model.add(Dropout(0.3, seed=1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def model_fit(model, x, y, test_x, test_y):
    return model.fit(x, y, batch_size=100, epochs=2, validation_data=(test_x, test_y))

embed_size = 128
lstm_model = init_lstm_model(num_words, embed_size)
model_train = model_fit(lstm_model, tok_train, np.array(df[df.label == 'train'].y),
                        tok_test, np.array(df[df.label == 'test'].y))
lstm_model.summary()

def ks_auc_value(y_value, df, model):
    y_pred = model.predict(df)
    fpr, tpr, thresholds = roc_curve(list(y_value), list(y_pred))
    ks = max(tpr - fpr)  # KS statistic: max gap between TPR and FPR
    auc = roc_auc_score(list(y_value), list(y_pred))
    return ks, auc

ks_auc_value(df[df.label == 'train'].y, tok_train, lstm_model)
'''
output:
(0.8611593007957995, 0.9749818730610305)
'''
ks_auc_value(df[df.label == 'test'].y, tok_test, lstm_model)
'''
output:
(0.7191120926957301, 0.9123405591831509)
'''

4. Word2vec + Bidirectional LSTM

        A bidirectional LSTM is two unidirectional LSTMs concatenated: one reads the sequence front to back, the other back to front. As the results above show, with this small dataset the Embedding + LSTM model overfits somewhat, so this section instead trains and evaluates a pretrained Word2vec embedding with a bidirectional LSTM; in the final evaluation the gap between training and test metrics narrows noticeably. The sketch below shows one way to build the embedding_matrix the model code relies on.
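        The article does not show how embedding_matrix is constructed. A minimal sketch, assuming the pretrained vectors come from a gensim (4.x) Word2Vec model trained on the preprocessed training texts; the variable names here are illustrative:

import numpy as np
from gensim.models import Word2Vec

# Train Word2vec on the tokenized training texts
sentences = [text.split() for text in df[df.label == 'train'].short if text]
w2v = Word2Vec(sentences=sentences, vector_size=128, window=5, min_count=1, workers=4, seed=1)

# Map the Tokenizer's word indices (1-based, capped at num_words) to Word2vec
# vectors; rows for words missing from the Word2vec vocabulary stay all-zero
embedding_matrix = np.zeros((num_words, 128))
for word, idx in tok_model.word_index.items():
    if idx < num_words and word in w2v.wv:
        embedding_matrix[idx] = w2v.wv[word]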

def init_lstm_model(max_features, embed_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=embed_size,
                        weights=[embedding_matrix], trainable=False))  # frozen pretrained embeddings
    model.add(Bidirectional(LSTM(units=32, activation='relu', recurrent_dropout=0.1)))
    model.add(Dropout(0.3, seed=1))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3, seed=1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def model_fit(model, x, y, test_x, test_y):
    return model.fit(x, y, batch_size=100, epochs=5, validation_data=(test_x, test_y))

num_words, embed_size = 6000, 128
lstm_model = init_lstm_model(num_words, embed_size, embedding_matrix)
model_train = model_fit(lstm_model, tok_train, np.array(df[df.label == 'train'].y),
                        tok_test, np.array(df[df.label == 'test'].y))
ks_auc_value(df[df.label == 'train'].y, tok_train, lstm_model)
'''
output:
(0.7223217797649937, 0.922939132379851)
'''
ks_auc_value(df[df.label == 'test'].y, tok_test, lstm_model)
'''
output:
(0.7046603930606234, 0.9140880065296716)
'''

Part 2: Key Takeaways

        On this small dataset, a trainable Embedding + LSTM overfits (large train/test KS and AUC gap); freezing pretrained Word2vec embeddings in a bidirectional LSTM clearly narrows that gap.

        Follow the WeChat official account Python风控模型与数据分析 and reply 文本分类4 to get the data and code for this article.

