
A Simple BiLSTM Sentiment Classifier in PyTorch


I. Preparing the Data

  seq = ["我喜欢你", "我恨你", "我今天很开心", "我最近很沮丧", "我很难过", "我讨厌你", "你非常的勤奋", "我特别懒惰", "我特别痛苦"]
  label = [1, 0, 1, 0, 0, 0, 1, 0, 0]  # 0 = negative sentiment, 1 = positive sentiment

II. Data Processing

  1. Tokenize the sentences with jieba, sort all tokens by frequency (highest first), and assign each token an index. Because the sentences have different lengths, a PAD token is added for padding. The resulting vocabulary looks like this:
    {'我': 1, '你': 2, '很': 3, '特别': 4, '喜欢': 5, '恨': 6, '今天': 7, '开心': 8, '最近': 9, '沮丧': 10, '难过': 11, '讨厌': 12, '非常': 13, '的': 14, '勤奋': 15, '懒惰': 16, '痛苦': 17, 'PAD': 0}
  2. Each tokenized sentence is then represented by the indices of its tokens, e.g. 我 / 喜欢 / 你 = [1, 5, 2] (see the short sketch after this list).
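As a quick illustration of this step, here is a minimal, standalone sketch (the toy vocabulary and variable names are only for illustration and are not part of the article's code) that tokenizes one sentence with jieba, maps the tokens to indices, and pads the result to a fixed length:

  import jieba

  # Toy vocabulary in the spirit of the table above; index 0 is reserved for PAD
  word2index = {"PAD": 0, "我": 1, "你": 2, "很": 3, "特别": 4, "喜欢": 5}
  seq_length = 5  # target length after padding

  tokens = list(jieba.cut("我喜欢你"))        # typically ['我', '喜欢', '你']
  ids = [word2index[t] for t in tokens]        # [1, 5, 2]
  ids = ids + [0] * (seq_length - len(ids))    # pad with 0 -> [1, 5, 2, 0, 0]
  print(tokens, ids)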

III. Model Construction

  • The input tokens are passed through an embedding layer to produce the word vectors fed to the LSTM.
  • The final hidden states of the forward LSTM and the backward LSTM are concatenated and used as the input to the fully connected layer.
  • The model structure follows directly from these two points; see the shape-check sketch below and the full code in the next section.
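The key detail is how PyTorch lays out the hidden state of a bidirectional LSTM: for a single-layer model, h_n has shape (2, batch, hidden), where h_n[0] is the final hidden state of the forward direction and h_n[1] that of the backward direction, so concatenating them gives a vector of size 2 * hidden per sentence. A minimal shape-check sketch (toy dimensions chosen here only for illustration):

  import torch
  import torch.nn as nn

  emb, hidden, seq_len, batch = 3, 5, 4, 2
  lstm = nn.LSTM(emb, hidden, num_layers=1, bidirectional=True)

  x = torch.randn(seq_len, batch, emb)         # default LSTM layout: (seq_len, batch, features)
  output, (h_n, c_n) = lstm(x)

  print(output.shape)   # torch.Size([4, 2, 10]) -> forward and backward outputs concatenated
  print(h_n.shape)      # torch.Size([2, 2, 5])  -> [forward final state, backward final state]

  encoding = torch.cat([h_n[0], h_n[1]], dim=1)
  print(encoding.shape) # torch.Size([2, 10])    -> input size expected by the fully connected layer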

IV. Full Code

  import torch
  import torch.nn as nn
  import torch.optim as optim
  import collections
  import torch.utils.data as Data
  import jieba

  seq = ["我喜欢你", "我恨你", "我今天很开心", "我最近很沮丧", "我很难过", "我讨厌你", "你非常的勤奋", "我特别懒惰", "我特别痛苦"]
  label = [1, 0, 1, 0, 0, 0, 1, 0, 0]

  # Tokenize every sentence with jieba
  seq_cut = []
  seq_cut_list = []
  for i in seq:
      cut_res = list(jieba.cut(i))
      seq_cut = seq_cut + cut_res
      seq_cut_list.append(cut_res)

  # Sort tokens by frequency, highest first
  word2num = sorted(collections.Counter(seq_cut).items(), key=lambda item: item[1], reverse=True)
  # Vocabulary
  vocab = list(set(seq_cut))
  # Word-to-index mapping (index 0 is reserved for PAD)
  word2index = {w[0]: i + 1 for i, w in enumerate(word2num)}
  word2index["PAD"] = 0
  # Vocabulary size and shape-related hyperparameters
  vocab_size = len(word2index)
  seq_size = len(seq)
  seq_length = max([len(i) for i in seq_cut_list])
  batch_size = 3
  embedding_size = 3
  num_classes = 2
  n_hidden = 5

  def make_data(seq, label):
      inputs = []
      for i in seq:
          seq_index = [word2index[word] for word in i]
          # Pad so every sentence has the same length
          if len(seq_index) != seq_length:
              seq_index = seq_index + [0] * (seq_length - len(seq_index))
          inputs.append(seq_index)
      targets = [i for i in label]
      return inputs, targets

  input_batch, target_batch = make_data(seq_cut_list, label)
  input_batch, target_batch = torch.LongTensor(input_batch), torch.LongTensor(target_batch)
  # dataset = Data.TensorDataset(input_batch, target_batch)
  # loader = Data.DataLoader(dataset, batch_size, shuffle=True)

  # Build the model
  class BiLSTM(nn.Module):
      def __init__(self):
          super(BiLSTM, self).__init__()
          self.word_vec = nn.Embedding(vocab_size, embedding_size)
          # bidirectional=True makes this a bidirectional LSTM
          self.bilstm = nn.LSTM(embedding_size, n_hidden, 1, bidirectional=True)
          self.fc = nn.Linear(n_hidden * 2, num_classes)

      def forward(self, input):
          embedding_input = self.word_vec(input)
          # Swap batch and sequence dimensions: (batch, seq_len, emb) -> (seq_len, batch, emb)
          embedding_input = embedding_input.permute(1, 0, 2)
          output, (h_n, c_n) = self.bilstm(embedding_input)
          # Concatenate the final hidden states of the forward and backward LSTM
          encoding1 = torch.cat([h_n[0], h_n[1]], dim=1)  # dim=1 concatenates along the feature dimension
          # Alternative: concatenate the first and last outputs of the bidirectional LSTM (not used below)
          encoding2 = torch.cat([output[0], output[-1]], dim=1)
          fc_out = self.fc(encoding1)
          return fc_out

  model = BiLSTM()
  print(model)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)

  # Training
  for epoch in range(5000):
      pred = model(input_batch)
      loss = criterion(pred, target_batch)
      if (epoch + 1) % 1000 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

  # Testing
  test_text = '我今天很痛苦'
  # Tokenize (every test token must already exist in word2index)
  test_cut = list(jieba.cut(test_text))
  # Map tokens to indices and pad
  test_batch, _ = make_data([test_cut], [1])
  test_batch = torch.LongTensor(test_batch)
  out = model(test_batch)
  predict = torch.max(out, 1)[1]
  if predict.item() == 0:
      print(test_text, "is Bad Mean...")
  else:
      print(test_text, "is Good Mean!!")
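The commented-out TensorDataset/DataLoader lines hint at mini-batch training instead of feeding the whole dataset at once. If you enable them, the training loop could look roughly like the sketch below (this is only an assumption about how the author intended to use the loader, and it relies on model, criterion, optimizer, input_batch, and target_batch being defined as in the listing above):

  dataset = Data.TensorDataset(input_batch, target_batch)
  loader = Data.DataLoader(dataset, batch_size, shuffle=True)

  for epoch in range(5000):
      for x_batch, y_batch in loader:       # iterate over mini-batches of size batch_size
          pred = model(x_batch)
          loss = criterion(pred, y_batch)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
      if (epoch + 1) % 1000 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))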
