import os
import re
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pylab import style
import jieba.posseg as psg
from gensim import corpora, models                 # topic mining / key-information extraction (conda install -c anaconda gensim)
from wordcloud import WordCloud, ImageColorGenerator  # pip install wordcloud
from collections import Counter
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
import graphviz

style.use('ggplot')                                # custom chart style
plt.rcParams['font.sans-serif'] = ['SimHei']       # render Chinese labels correctly

# Show every bare expression in a Jupyter cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

raw_data = pd.read_csv('reviews.csv')

# Value distribution of the categorical columns
for cate in ['creationTime', 'nickname', 'referenceName', 'content_type']:
    raw_data[cate].value_counts()

# Keep only the review text and its label, then drop exact duplicates
reviews = raw_data.copy()
reviews = reviews[['content', 'content_type']]
reviews = reviews.drop_duplicates()

# Cleaning: strip digits, Latin letters and the brand/product words
# 京东 (JD), 美的 (Midea), 电热水器 / 热水器 (electric water heater)
content = reviews['content']
info = re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器')
content = content.apply(lambda x: info.sub('', x))  # replace every match

# Segmentation with POS tagging: each review becomes a list of (word, flag) tuples
seg_content = content.apply(lambda s: [(x.word, x.flag) for x in psg.cut(s)])

# Token count per review
n_word = seg_content.apply(len)

# For every token, record which review it came from (1-based review id)
n_content = [[x + 1] * y for x, y in zip(list(seg_content.index), list(n_word))]
index_content_long = sum(n_content, [])  # flatten the list of lists
# e.g. sum([[2, 2], [3, 3, 3]], []) -> [2, 2, 3, 3, 3]

# Flatten the (word, flag) tuples the same way
seg_content_long = sum(seg_content, [])
word_long = [x[0] for x in seg_content_long]
nature_long = [x[1] for x in seg_content_long]

# Stretch content_type to token level as well
n_content_type = [[x] * y for x, y in zip(list(reviews['content_type']), list(n_word))]
content_type_long = sum(n_content_type, [])

# One row per token (the "long" table)
review_long = pd.DataFrame({'index_content': index_content_long,
                            'word': word_long,
                            'nature': nature_long,
                            'content_type': content_type_long})
review_long.shape
review_long['nature'].unique()

# Remove punctuation: POS tag 'x' marks non-word symbols
review_long_clean = review_long[review_long['nature'] != 'x']

# Load and preprocess the stop-word list
with open('stoplist.txt', 'r', encoding='UTF-8') as stop_path:
    stop_words = stop_path.readlines()
stop_words = [word.strip('\n') for word in stop_words]

# Keep only tokens that are not stop words
word_long_clean = list(set(word_long) - set(stop_words))
review_long_clean = review_long_clean[review_long_clean['word'].isin(word_long_clean)]

# Re-count tokens per review and add a within-review position index
n_word = review_long_clean.groupby('index_content').count()['word']
index_word = [list(np.arange(1, x + 1)) for x in list(n_word)]
index_word_long = sum(index_word, [])
review_long_clean['index_word'] = index_word_long

review_long_clean.to_csv('1_review_long_clean.csv')
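# --- Aside: how the "long table" expansion works ---------------------------
# A minimal, self-contained sketch of the flattening trick used above, run on
# two made-up toy reviews (the tokens here are invented for illustration):

toy_seg = pd.Series([[('很', 'd'), ('好', 'a')], [('不', 'd'), ('满意', 'a')]])
toy_n = toy_seg.apply(len)

# replicate each review id once per token, then flatten with sum(..., [])
toy_index = sum([[i + 1] * n for i, n in zip(toy_seg.index, toy_n)], [])
toy_tokens = sum(toy_seg, [])

toy_long = pd.DataFrame({'index_content': toy_index,
                         'word': [t[0] for t in toy_tokens],
                         'nature': [t[1] for t in toy_tokens]})
print(toy_long)  # four rows, one per token, each aligned with its review id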
# Keep only nouns: jieba POS tags containing 'n'
n_review_long_clean = review_long_clean[['n' in nat for nat in review_long_clean.nature]]
n_review_long_clean.nature.value_counts()
n_review_long_clean.to_csv('1_n_review_long_clean.csv')

# Word cloud over all cleaned tokens
font = r"C:\Windows\Fonts\msyh.ttc"
background_image = plt.imread('1.png')
wordcloud = WordCloud(font_path=font, max_words=100, mode='RGBA',
                      background_color='white', mask=background_image)
wordcloud.generate_from_frequencies(Counter(review_long_clean.word.values))
wordcloud.to_file('1_分词后的词云图.png')
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Word cloud over nouns only
wordcloud = WordCloud(font_path=font, max_words=100, mode='RGBA',
                      background_color='white', mask=background_image)
wordcloud.generate_from_frequencies(Counter(n_review_long_clean.word.values))
wordcloud.to_file('1_分词后的词云图(名词).png')
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Step 1: build the feature space and the labels
Y = []
for ind in review_long_clean.index_content.unique():
    y = [word for word in review_long_clean.content_type[review_long_clean.index_content == ind].unique()]
    Y.append(y)

X = []
for ind in review_long_clean.index_content.unique():
    term = [word for word in review_long_clean.word[review_long_clean.index_content == ind].values]
    X.append(' '.join(term))

# Step 2: train / test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

# Step 3: words to vectors -- a binary 0/1 presence matrix
count_vec = CountVectorizer(binary=True)
x_train = count_vec.fit_transform(x_train)
x_test = count_vec.transform(x_test)

# Step 4: fit a decision tree
dtc = tree.DecisionTreeClassifier(max_depth=5)
dtc.fit(x_train, y_train)
print('Training accuracy: %.2f' % accuracy_score(y_train, dtc.predict(x_train)))
y_true = y_test
y_pred = dtc.predict(x_test)
print(classification_report(y_true, y_pred))
print('Test accuracy: %.2f' % accuracy_score(y_true, y_pred))

# Step 5: draw the tree (needs the Graphviz binaries installed and on PATH)
os.environ["PATH"] += os.pathsep + 'C:\\Program Files\\Graphviz\\bin'
dot_data = tree.export_graphviz(dtc, feature_names=count_vec.get_feature_names())
# on scikit-learn >= 1.2 use count_vec.get_feature_names_out() instead
graph = graphviz.Source(dot_data)
# graph.render("test", view=True)

# Load the sentiment lexicons.
# sep='/n' never occurs in the files, so each whole line is read as one field.
pos_comment = pd.read_csv('正面评价词语(中文).txt', header=None, sep='/n', engine='python')
neg_comment = pd.read_csv('负面评价词语(中文).txt', header=None, sep='/n', engine='python')
pos_emotion = pd.read_csv('正面情感词语(中文).txt', header=None, sep='/n', engine='python')
neg_emotion = pd.read_csv('负面情感词语(中文).txt', header=None, sep='/n', engine='python')

pos = pd.concat([pos_comment, pos_emotion], axis=0)
neg = pd.concat([neg_comment, neg_emotion], axis=0)

# Extend the lexicons with new words; first check they are not already there
'点赞' in pos.values
'歇菜' in neg.values
new_pos = pd.Series(['点赞'])
new_neg = pd.Series(['歇菜'])
positive = pd.concat([pos, new_pos], axis=0)
negative = pd.concat([neg, new_neg], axis=0)

# Attach the polarity weight: +1 for positive words, -1 for negative words
positive.columns = ['review']
positive['weight'] = [1] * len(positive)
negative.columns = ['review']
negative['weight'] = [-1] * len(negative)
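# --- Aside: what CountVectorizer(binary=True) produces ----------------------
# A small sketch on a made-up two-document corpus (not from the dataset):
# repeated words still yield 1, so the matrix records presence, not counts.

toy_docs = ['加热 很快 满意', '加热 加热 满意']
toy_vec = CountVectorizer(binary=True)
toy_m = toy_vec.fit_transform(toy_docs)
print(sorted(toy_vec.vocabulary_))  # learned vocabulary (version-agnostic access)
print(toy_m.toarray())              # doc 2 contains 加热 twice, but the cell is still 1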
pos_neg = pd.concat([positive, negative], axis=0)

# Join the lexicon onto the token table
data = review_long_clean.copy()
review_mltype = pd.merge(data, pos_neg, how='left', left_on='word', right_on='review')
review_mltype = review_mltype.drop(['review'], axis=1)
review_mltype = review_mltype.replace(np.nan, 0)   # words outside the lexicon get weight 0

# Negation words used to correct the sentiment orientation
notdict = pd.read_csv('not.csv')
notdict['freq'] = [1] * len(notdict)

# Preparation 1: working columns
review_mltype['amend_weight'] = review_mltype['weight']
review_mltype['id'] = np.arange(0, review_mltype.shape[0])

# Preparation 2: keep only the rows that carry a sentiment value
only_review_mltype = review_mltype[review_mltype['weight'] != 0]
only_review_mltype.index = np.arange(0, only_review_mltype.shape[0])  # reset the index

# Example: the review that contains the 5th sentiment word
i = 4
review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]

# Inspect the two words before each sentiment word to detect negation:
# if the word opens the sentence there is nothing to check; if it is the
# second word, only the first word is checked.
index = only_review_mltype['id']

for i in range(0, only_review_mltype.shape[0]):
    # the review containing the i-th sentiment word
    review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
    review_i.index = np.arange(0, review_i.shape[0])   # after the reset, index == index_word - 1
    word_ind = only_review_mltype['index_word'][i]     # position of the sentiment word in its review

    # Case 1: first word of the review -- nothing to check.
    # Case 2: second word -- check the single word before it.
    if word_ind == 2:
        ne = sum([review_i['word'][word_ind - 1] in notdict['term'].values])
        if ne == 1:
            review_mltype.loc[index[i], 'amend_weight'] = -review_mltype['weight'][index[i]]
    # Case 3: third word or later -- check the two words before it.
    elif word_ind > 2:
        ne = sum([word in notdict['term'].values
                  for word in review_i['word'][[word_ind - 1, word_ind - 2]]])
        if ne == 1:
            review_mltype.loc[index[i], 'amend_weight'] = -review_mltype['weight'][index[i]]

# Rows where the correction flipped the sign (empty means the two columns match)
review_mltype[(review_mltype['weight'] - review_mltype['amend_weight']) != 0]

# Sentiment score per review
emotion_value = review_mltype.groupby('index_content', as_index=False)['amend_weight'].sum()
emotion_value.to_csv('./1_emotion_value', index=True, header=True)

# Inspect the result: reviews whose amend_weight total is non-zero
content_emotion_value = emotion_value.copy()
content_emotion_value = content_emotion_value[content_emotion_value['amend_weight'] != 0]
content_emotion_value['ml_type'] = ''
content_emotion_value.loc[content_emotion_value['amend_weight'] > 0, 'ml_type'] = 'pos'
content_emotion_value.loc[content_emotion_value['amend_weight'] < 0, 'ml_type'] = 'neg'
print(content_emotion_value)

# Reviews whose amend_weight total is exactly zero.
# This is the weak spot of the method: more than half of the reviews cannot
# be told apart as positive or negative.
content_emotion_value0 = emotion_value.copy()
content_emotion_value0 = content_emotion_value0[content_emotion_value0['amend_weight'] == 0]
content_emotion_value0.head()
print(raw_data.content[6])
print(raw_data.content[7])
print(raw_data.content[8])

# Merge the predicted polarity back into the token table
content_emotion_value = content_emotion_value.drop(['amend_weight'], axis=1)
review_mltype = pd.merge(review_mltype, content_emotion_value, how='left', on='index_content')
review_mltype = review_mltype.drop(['id'], axis=1)
review_mltype.to_csv('./1_review_mltype', index=True, header=True)

# Confusion matrix: dictionary polarity vs. the original labels
cate = ['index_content', 'content_type', 'ml_type']
data_type = review_mltype[cate].drop_duplicates()
confusion_matrix = pd.crosstab(data_type['content_type'], data_type['ml_type'], margins=True)
print(confusion_matrix)

data = data_type[['content_type', 'ml_type']]
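# --- Aside: the negation rule as a standalone function ----------------------
# A hedged re-statement of the loop above as one function, assuming `not_words`
# is the set of negation words loaded from not.csv. The weight flips only when
# exactly one of the (at most two) preceding tokens negates it, so a double
# negation keeps the original polarity.

def amend_weight_sketch(words, pos, weight, not_words):
    # words: tokens of one review; pos: 1-based position of the sentiment word
    if pos == 1:                               # sentence-initial: nothing before it
        return weight
    window = words[max(0, pos - 3):pos - 1]    # one or two preceding tokens
    hits = sum(w in not_words for w in window)
    return -weight if hits == 1 else weight

print(amend_weight_sketch(['我', '不', '满意'], 3, 1, {'不', '没有'}))  # -> -1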
data = data.dropna(axis=0)
print(classification_report(data['content_type'], data['ml_type']))

# Word clouds for positive vs. negative tokens (rows with a sentiment value only)
data = review_mltype.copy()
data = data[data['amend_weight'] != 0]
word_data_pos = data[data['ml_type'] == 'pos']
word_data_neg = data[data['ml_type'] == 'neg']

font = r"C:\Windows\Fonts\msyh.ttc"
background_image = plt.imread('1.png')

wordcloud = WordCloud(font_path=font, max_words=100, mode='RGBA',
                      background_color='white', mask=background_image)
wordcloud.generate_from_frequencies(Counter(word_data_pos.word.values))
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

wordcloud = WordCloud(font_path=font, max_words=100, mode='RGBA',
                      background_color='white', mask=background_image)
wordcloud.generate_from_frequencies(Counter(word_data_neg.word.values))
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# LDA topic modelling on the positive and negative tokens
data = review_mltype.copy()
word_data_pos = data[data['ml_type'] == 'pos']
word_data_neg = data[data['ml_type'] == 'neg']

# Build the dictionaries (deduplicated vocabularies); each token is its own document
pos_dict = corpora.Dictionary([[i] for i in word_data_pos.word])
neg_dict = corpora.Dictionary([[i] for i in word_data_neg.word])

# Build the corpora: each entry is a list of (token_id, count) pairs, count = 1
pos_corpus = [pos_dict.doc2bow(j) for j in [[i] for i in word_data_pos.word]]
neg_corpus = [neg_dict.doc2bow(j) for j in [[i] for i in word_data_neg.word]]

def cos(vector1, vector2):
    '''Cosine similarity between two equal-length vectors.'''
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    return dot_product / ((normA * normB) ** 0.5)

# Topic-number search; the function is generic and can be reused on other corpora.
def LDA_k(x_corpus, x_dict):
    '''Mean cosine similarity between topics for num_topics = 1..10.'''
    mean_similarity = [1]  # num_topics = 1: a lone topic is identical to itself

    for i in np.arange(2, 11):
        lda = models.LdaModel(x_corpus, num_topics=i, id2word=x_dict)  # train the model
        term = lda.show_topics(num_words=50)  # top 50 words of each topic

        # Extract the word list of each topic, shape = (i, 50)
        top_word = []
        for k in np.arange(i):
            top_word.append([''.join(re.findall('"(.*)"', w))
                             for w in term[k][1].split('+')])

        # Word-frequency vectors over the union of all topic words
        word = sum(top_word, [])        # all topic words
        unique_word = set(word)         # deduplicated
        mat = []                        # shape = (i, len(unique_word)); row = topic
        for j in np.arange(i):
            top_w = top_word[j]
            mat.append(tuple([top_w.count(k) for k in unique_word]))

        # All ordered topic pairs (an alternative is two nested loops over
        # unordered pairs)
        p = list(itertools.permutations(list(np.arange(i)), 2))
        y = len(p)  # y = i * (i - 1)
        top_similarity = [0]
        for w in np.arange(y):
            vector1 = mat[p[w][0]]
            vector2 = mat[p[w][1]]
            top_similarity.append(cos(vector1, vector2))

        # Mean cosine similarity for this topic number
        mean_similarity.append(sum(top_similarity) / y)
    return mean_similarity

# Mean inter-topic similarity for both polarities
pos_k = LDA_k(pos_corpus, pos_dict)
neg_k = LDA_k(neg_corpus, neg_dict)
pos_k
neg_k

pd.Series(pos_k, index=range(1, 11)).plot()
plt.title('LDA topic-number search: positive reviews')
plt.show()

pd.Series(neg_k, index=range(1, 11)).plot()
plt.title('LDA topic-number search: negative reviews')
plt.show()

# Final models with the chosen topic number
pos_lda = models.LdaModel(pos_corpus, num_topics=2, id2word=pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics=2, id2word=neg_dict)
pos_lda.print_topics(num_topics=10)
neg_lda.print_topics(num_topics=10)
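# --- Aside: reading the topic number off the curves -------------------------
# The script settles on num_topics = 2 from the two plots. As a sketch, the
# same choice could be automated by taking the k with the lowest mean
# inter-topic similarity (the most distinct topics); pos_k and neg_k come
# from LDA_k above.

def best_k_sketch(mean_similarity):
    sims = mean_similarity[1:]           # entries for k = 2 .. 10 (index 0 is k = 1)
    return int(np.argmin(sims)) + 2      # shift back to the corresponding k

print(best_k_sketch(pos_k), best_k_sketch(neg_k))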