当前位置:   article > 正文

python-sklearn实现一个简易的智能问答机器人(聊天对话自动应答)

爬虫实现聊天对话自动应答

随着AI的发展,各大企业采用智能问答机器人取代了人工客服。智能问答系统实现的方法有很多,本篇文章介绍之前做的一个简易的智能问答机器人。采用的方法是使用朴素贝叶斯模型进行问题分类,模糊匹配查询近似问题。

  • 实现步骤

1.1 总体流程设计

    问答系统总体实现步骤如下流程图主要包括数据预处理,模型训练,结果映射以及答案匹配。数据预处理主要是对语料库进行收集,并对语料库进行筛选转换为需要的格式;然后使用朴素贝叶斯对处理好的语料库进行训练,语料库输入模型训练之前需要先对其进行预处理以及转换(去停用词,分词,TF-IDF计算),通过TF-IDF计算后则输入朴素贝叶斯中进行训练,由于我的语料库比较简易,所以采用默认参数训练即可达到较好的分类效果。在结果映射步骤中,主要是对事先确定好的类别进行映射处理(可用于脚本网页跳转使用)。答案匹配采用了模糊匹配的方法对用户提的问题进行匹配,搜索出相似的问题并给出其对应的答案。

 

 

1.2 语料库收集

语料库收集如下图。这里第一列为需要分类的类别,第二列为相关的问题。本篇中的语料库主要分为人社信息语料库以及娱乐聊天语料库。

人社信息语料库:

 

娱乐聊天语料库:

 

1.3 主要程序介绍

可视化界面GUI主要采用了tkinter工具包完成,rum_main.py程序如下:

  1. #!/usr/bin/env python3
  2. # _*_ coding:utf-8 _*_
  3. from tkinter import *
  4. import time
  5. from speech_test import *
  6. '''
  7. 定义消息发送函数:
  8. 1、在<消息列表分区>的文本控件中实时添加时间;
  9. 2、获取<发送消息分区>的文本内容,添加到列表分区的文本中;
  10. 3、将<发送消息分区>的文本内容清空。
  11. '''
  12. def msgsend():
  13. msg = '我:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n'
  14. # print(msg)
  15. txt_msglist.insert(END, msg, 'green') # 添加时间
  16. query = txt_msgsend.get('0.0', END) #!!!!!!!!!!!!!!!11
  17. print(query)
  18. result = main(query) #问题输入模型入口
  19. print('result:',result)
  20. txt_msglist.insert(END, txt_msgsend.get('0.0', END)) # 获取发送消息,添加文本到消息列表
  21. txt_msglist.insert(END, '\n')
  22. txt_msgsend.delete('0.0', END) # 清空发送消息
  23. robot = '小Y:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n'
  24. txt_msglist.insert(END, robot, 'red')
  25. txt_msglist.insert(END, result+'\n')
  26. '''定义取消发送 消息 函数'''
  27. def cancel():
  28. txt_msgsend.delete('0.0', END) # 取消发送消息,即清空发送消息
  29. '''绑定up键'''
  30. def msgsendEvent(event):
  31. if event.keysym == 'Up':
  32. msgsend()
  33. tk = Tk()
  34. tk.title('聊天窗口')
  35. '''创建分区'''
  36. f_msglist = Frame(height=300, width=300) # 创建<消息列表分区 >
  37. f_msgsend = Frame(height=300, width=300) # 创建<发送消息分区 >
  38. f_floor = Frame(height=100, width=300) # 创建<按钮分区>
  39. f_right = Frame(height=700, width=100) # 创建<图片分区>
  40. '''创建控件'''
  41. txt_msglist = Text(f_msglist) # 消息列表分区中创建文本控件
  42. txt_msglist.tag_config('green', foreground='blue') # 消息列表分区中创建标签
  43. txt_msglist.tag_config('red', foreground='red') # 消息列表分区中创建标签
  44. txt_msgsend = Text(f_msgsend) # 发送消息分区中创建文本控件
  45. txt_show = Text(f_msglist) # 消息列表分区中创建文本控件
  46. txt_show.tag_config('red', foreground='red') # 消息列表分区中创建标签
  47. txt_showsend = Text(f_msgsend) # 发送消息分区中创建文本控件
  48. txt_msgsend.bind('<KeyPress-Up>', msgsendEvent) # 发送消息分区中,绑定‘UP’键与消息发送。
  49. '''txt_right = Text(f_right) #图片显示分区创建文本控件'''
  50. button_send = Button(f_floor, text='Send',command=msgsend) # 按钮分区中创建按钮并绑定发送消息函数
  51. button_cancel = Button(f_floor, text='Cancel', command=cancel) # 分区中创建取消按钮并绑定取消函数
  52. '''分区布局'''
  53. f_msglist.grid(row=0, column=0) # 消息列表分区
  54. f_msgsend.grid(row=1, column=0) # 发送消息分区
  55. f_floor.grid(row=2, column=0) # 按钮分区
  56. f_right.grid(row=0, column=1, rowspan=3) # 图片显示分区
  57. txt_msglist.grid() # 消息列表文本控件加载
  58. txt_msgsend.grid() # 消息发送文本控件加载
  59. button_send.grid(row=0, column=0, sticky=W) # 发送按钮控件加载
  60. button_cancel.grid(row=0, column=1, sticky=W) # 取消按钮控件加载
  61. tk.mainloop()

智能问答机器人相关程序为 speech_test.py,程序如下:

  1. #-*- coding:utf-8 -*-
  2. import logging
  3. logging.getLogger("requests").setLevel(logging.WARNING)
  4. import csv
  5. import jieba
  6. import pickle
  7. from fuzzywuzzy import fuzz
  8. import math
  9. from scipy import sparse
  10. from sklearn.feature_extraction.text import CountVectorizer
  11. from sklearn.feature_extraction.text import TfidfTransformer
  12. from scipy.sparse import lil_matrix
  13. import jieba.posseg as pseg
  14. import sys
  15. import pandas as pd
  16. from sklearn.naive_bayes import MultinomialNB
  17. from speech_recognition import *
  18. import warnings
  19. warnings.filterwarnings("ignore")
  20. def load_label_url():
  21. with open('znwd_label_url.csv','r',encoding='utf-8') as f:
  22. name_id = {}
  23. label_url = csv.reader(f)
  24. header = next(label_url)
  25. for power_name_id in label_url:
  26. name_id[power_name_id[0]] = power_name_id[1]
  27. return name_id
  28. def load_cut_save(filename,load = False):
  29. jieba.load_userdict('UserDefined_words.txt')
  30. corpus = []
  31. label = []
  32. with open(filename,'rt',encoding='utf-8') as f:
  33. data_corpus = csv.reader(f)
  34. header = next(data_corpus)
  35. for words in data_corpus:
  36. word = jieba.cut(words[1])
  37. doc = []
  38. for x in word:
  39. if x not in stop_words and not x.isdigit():
  40. doc.append(x)
  41. corpus.append(' '.join(doc))
  42. label.append(words[0])
  43. if load == True:
  44. with open('corpus.oj','wb') as f:
  45. pickle.dump(corpus,f)
  46. with open('label.oj','wb') as f:
  47. pickle.dump(label,f)
  48. return corpus,label
  49. def train_model():
  50. with open('corpus.oj','rb') as f_corpus:
  51. corpus = pickle.load(f_corpus)
  52. with open('label.oj','rb') as f_label:
  53. label = pickle.load(f_label,encoding='bytes')
  54. vectorizer = CountVectorizer(min_df=1)
  55. transformer = TfidfTransformer()
  56. tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
  57. words_frequency = vectorizer.fit_transform(corpus)
  58. word = vectorizer.get_feature_names()
  59. saved = input_tfidf(vectorizer.vocabulary_,sparse.csc_matrix(words_frequency),len(corpus))
  60. model = MultinomialNB()
  61. model.fit(tfidf,label)
  62. with open('model.oj','wb') as f_model:
  63. pickle.dump(model,f_model)
  64. with open('idf.oj','wb') as f_idf:
  65. pickle.dump(saved,f_idf)
  66. return model,tfidf,label
  67. class input_tfidf(object):
  68. def __init__(self,feature_index,frequency,docs):
  69. self.feature_index = feature_index
  70. self.frequency = frequency
  71. self.docs = docs
  72. self.len = len(feature_index)
  73. def key_count(self,input_words):
  74. keys = jieba.cut(input_words)
  75. count = {}
  76. for key in keys:
  77. num = count.get(key, 0)
  78. count[key] = num + 1
  79. return count
  80. def getTdidf(self,input_words):
  81. count = self.key_count(input_words)
  82. result = lil_matrix((1, self.len))
  83. frequency = sparse.csc_matrix(self.frequency)
  84. for x in count:
  85. word = self.feature_index.get(x)
  86. if word != None and word>=0:
  87. word_frequency = frequency.getcol(word)
  88. feature_docs = word_frequency.sum()
  89. tfidf = count.get(x) * (math.log((self.docs+1) / (feature_docs+1))+1)
  90. result[0, word] = tfidf
  91. return result
  92. def model_predict(input_str):
  93. f = open('idf.oj','rb')
  94. idf = pickle.load(f)
  95. f.close()
  96. f = open('model.oj','rb')
  97. model = pickle.load(f)
  98. f.close()
  99. tfidf = idf.getTdidf(input_str)
  100. classifiction = (model.predict(tfidf))
  101. # print(model.predict_proba(tfidf))
  102. prob = model.predict_proba(tfidf).max()
  103. name_id = load_label_url()
  104. if prob >= 0.5:
  105. answer1 = str(classifiction[0],'utf-8')
  106. else:
  107. answer1 = None
  108. return answer1
  109. def similarity(input_questions):
  110. with open('corpus_1233.oj', 'rb') as f:
  111. corpus = pickle.load(f,encoding='bytes')
  112. with open('question_1233.oj', 'rb') as f:
  113. question = pickle.load(f,encoding='bytes')
  114. with open('answer_1233.oj', 'rb') as f:
  115. answer = pickle.load(f,encoding='bytes')
  116. text = {}
  117. train = []
  118. answer2 = []
  119. for key, value in enumerate(corpus):
  120. similarity = fuzz.ratio(input_questions, value)
  121. if similarity > 40:
  122. text[key] = similarity
  123. if len(text) >= 3:
  124. train = sorted(text.items(), key=lambda d: d[1], reverse=True)
  125. # print(u"与您提的疑问相似的问题有\n")
  126. for i in range(3):
  127. an = {"question":question[train[i][0]],"answer":answer[train[i][0]]}
  128. answer2.append(an)
  129. # print("%d、" % (i + 1), \
  130. # " 问题:%s\n" % str(question[train[i][0]],'utf-8'), \
  131. # " 答案:%s" % str(answer[train[i][0]],'utf-8'))
  132. elif len(text) == 2:
  133. train = sorted(text.items(), key=lambda d: d[1], reverse=True)
  134. # print("与您提的疑问相似的问题有\n")
  135. for i in range(2):
  136. an = {"question":question[train[i][0]],"answer":answer[train[i][0]]}
  137. answer2.append(an)
  138. # print("%d、" % (i + 1), \
  139. # " 问题:%s\n" % str(question[train[i][0]],'utf-8'), \
  140. # " 答案:%s" % str(answer[train[i][0]],'utf-8'))
  141. elif len(text) == 1:
  142. an = {"question": question[list(text.keys())[0]], "answer": answer[list(text.keys())[0]]}
  143. answer2.append(an)
  144. # print("与您提的疑问相似的问题有:\n", \
  145. # " 问题:%s" % str(question[text.keys()[0]],'utf-8'), \
  146. # " 答案:%s" % str(answer[text.keys()[0]],'utf-8'))
  147. else:
  148. # print("您所提的疑问无其他相似问题!")
  149. an = {"question":None,"answer":None}
  150. answer2.append(an)
  151. return answer2
  152. def get_greeting(input_questions,question,answer):
  153. text = {}
  154. for key, value in enumerate(question):
  155. similarity = fuzz.ratio(input_questions, value)
  156. if similarity > 60:
  157. text[key] = similarity
  158. if len(text) > 0:
  159. train = sorted(text.items(), key=lambda d: d[1], reverse=True)
  160. answer3 = answer[train[0][0]]
  161. else:
  162. answer3 = None
  163. return answer3
  164. def sim(doc):
  165. input_questions = ''
  166. input_words = jieba.cut(doc)
  167. for x in input_words:
  168. if x not in stop_words:
  169. input_questions += x
  170. answer2 = similarity(input_questions)
  171. return answer2
  172. def ans_show(returnSet):
  173. if returnSet[2] is not None:
  174. ans = "%s"%returnSet[2]
  175. elif returnSet[0] is not None:
  176. ans = "您的问题属于<%s>专栏\n"%returnSet[0]
  177. ans1 = ""
  178. if returnSet[1][0]['question'] is not None:
  179. ans1 = "小Y还知道其他一些问题例如:\n"
  180. ans2 = ""
  181. for i in range(len(returnSet[1])):
  182. ans2 = ans2 + "%d、" % (i + 1) + " 问题:%s\n" % str(returnSet[1][i]['question'],'utf-8') + " 答案:%s" % str(returnSet[1][i]['answer'],'utf-8')
  183. ans1 = ans1 + ans2
  184. ans = ans + ans1
  185. elif returnSet[1][0]['question'] is not None:
  186. ans1 = "小Y知道相似的问题:\n"
  187. ans2 = ""
  188. for i in range(len(returnSet[1])):
  189. ans2 = ans2 + "%d、" % (i + 1) + " 问题:%s\n" % str(returnSet[1][i]['question'], 'utf-8') + " 答案:%s" % str(returnSet[1][i]['answer'], 'utf-8')
  190. ans = ans1 + ans2
  191. else:
  192. ans = "您问的问题太过深奥,Mike才疏学浅暂时无法为您解答,待我读书破万卷后成为您的百科机器人"
  193. return ans
  194. with open('stop_words.txt', 'rb') as f:
  195. stop_words = f.read().splitlines()
  196. question_greeting = []
  197. answer_greeting = []
  198. with open("greeting.csv", 'r',encoding='utf-8') as f:
  199. greeting = csv.reader(f)
  200. header = next(greeting)
  201. for words in greeting:
  202. question_greeting.append(words[0])
  203. answer_greeting.append(words[1])
  204. filename = 'znwd_corpus.csv'
  205. corpus, label = load_cut_save(filename,load=False)
  206. def main(question):
  207. if question != None:
  208. query = question #########此处会因语音无法识别还报错
  209. print("我 > %s" %query)
  210. ##############
  211. answer3 = get_greeting(query,question_greeting,answer_greeting)
  212. # print(answer3)
  213. if answer3 is None:
  214. answer1 = model_predict(query)
  215. answer2 = sim(query)
  216. else:
  217. answer1 = None
  218. answer2 = None
  219. ans = [answer1,answer2,answer3]
  220. result = ans_show(ans)
  221. else:
  222. result = "输入有误请重新输入!"
  223. query = None
  224. return result

因而,一个简单的智能问答机器人即可实现,若需要问答机器人能够回答更多的内容,可针对语料库进行修改,进而丰富智能问答机器人的聊天范围。程序中读取语料库采用了pickle工具包将预处理后的语料库进行序列化至本地,进而在下次使用语料库不需要再次进行预处理,可节省处理的时间。修改语料库后需重新覆盖序列化至本地的语料库文件和模型文件。

  • 效果展示

最终效果如下,我的语料库主要用了社保卡相关的数据,该问答系统可以当作是一个社保信息方面的问答。

 

 

工程源码:https://github.com/liangjunAI/chatting_robot

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/花生_TL007/article/detail/343958
推荐阅读
相关标签
  

闽ICP备14008679号