
Sentiment Analysis of E-commerce Product Review Data

Full sentiment-analysis code for e-commerce (JD) review data:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pylab import style # custom chart style
style.use('ggplot')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
plt.rcParams['font.sans-serif'] = ['SimHei'] # render Chinese characters correctly in plots

import re
import jieba.posseg as psg
import itertools
#conda install -c anaconda gensim
from gensim import corpora,models # topic mining / key-information extraction

# pip install wordcloud
from wordcloud import WordCloud,ImageColorGenerator
from collections import Counter

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import graphviz

raw_data=pd.read_csv('reviews.csv')
#raw_data.info()
#raw_data.columns
# Value distribution of the categorical columns
for cate in ['creationTime', 'nickname', 'referenceName', 'content_type']:
    raw_data[cate].value_counts()

reviews=raw_data.copy()
reviews=reviews[['content', 'content_type']]
#print('Before dedup:',reviews.shape[0])
reviews=reviews.drop_duplicates()
#print('After dedup:',reviews.shape[0])

# Before cleaning
content=reviews['content']
#print('Before cleaning -----------')
#for i in range(5,10):
#    print(content[i])
#    print('-----------')

# Cleaning: remove digits, Latin letters, and the brand/product terms
# 京东 (JD), 美的 (Midea), 电热水器/热水器 (electric water heater).
# Note: the original pattern ended in a stray '|', which also matches the
# empty string; it is dropped here.
info=re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器')
content=content.apply(lambda x: info.sub('',x))  # replace every match
#print('After cleaning -----------')
#for i in range(5,10):
#    print(content[i])
#    print('-----------')
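
A quick illustrative check of the cleaning step (the sample string is made up):

#print(info.sub('', '京东自营美的热水器ABC123很不错'))  # -> '自营很不错'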

# Tokenize with POS tagging: each review becomes a list of (word, flag) tuples
seg_content=content.apply( lambda s:  [(x.word,x.flag) for x in psg.cut(s)] )

seg_content.shape
len(seg_content)
#print(seg_content[5])
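
To see what the segmenter returns, inspect one tokenized review (the exact POS flags depend on jieba's dictionary):

#print(seg_content[5][:5])  # a list of (word, flag) tuples, e.g. [('很', 'd'), ...]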

# Count tokens per review
n_word=seg_content.apply(lambda s: len(s))

len(n_word)
n_word.head(6)

# Record which review each token belongs to
n_content=[ [x+1]*y for x,y in zip(list(seg_content.index),list(n_word))] # [x+1]*y repeats the review id y times, giving a list of lists
index_content_long=sum(n_content,[]) # flatten the list of lists into one flat list
len(index_content_long)
sum([[2,2],[3,3,3]],[]) # quick demo of the flattening trick: returns [2, 2, 3, 3, 3]
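
The sum(list_of_lists, []) idiom is quadratic in the total length; itertools (imported above) gives an equivalent linear-time flatten:

#index_content_long = list(itertools.chain.from_iterable(n_content))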

# Flatten the (word, flag) lists the same way
seg_content.head()

seg_content_long=sum(seg_content,[])

seg_content_long

type(seg_content_long)
len(seg_content_long)
seg_content_long[0]
# Long-format lists of words and their POS tags
word_long=[x[0] for x in seg_content_long]
nature_long=[x[1] for x in seg_content_long]

len(word_long)
len(nature_long)
# Stretch content_type to the same long format
n_content_type=[ [x]*y for x,y in zip(list(reviews['content_type']),list(n_word))] # [x]*y repeats the label y times
content_type_long=sum(n_content_type,[]) # flatten

len(content_type_long)
review_long=pd.DataFrame({'index_content':index_content_long,
                        'word':word_long,
                        'nature':nature_long,
                        'content_type':content_type_long})
review_long.shape
review_long.head()


review_long['nature'].unique()
#print(review_long)

# Remove punctuation
review_long_clean=review_long[review_long['nature']!='x'] # POS tag 'x' marks punctuation
review_long_clean.shape
# Load stop words (context manager so the file is closed)
with open('stoplist.txt','r',encoding='UTF-8') as stop_path:
    stop_words=stop_path.readlines()

len(stop_words)
stop_words[0:5]
# Preprocess the stop words: strip trailing newlines
stop_words=[word.strip('\n') for word in stop_words]
stop_words[0:5]
# Vocabulary with stop words removed
word_long_clean=list(set(word_long)-set(stop_words))
#print(len(word_long_clean))

review_long_clean=review_long_clean[review_long_clean['word'].isin(word_long_clean)]
#print(review_long_clean.shape)


# Re-count tokens per review after cleaning
n_word=review_long_clean.groupby('index_content').count()['word']
n_word

index_word=[ list(np.arange(1,x+1)) for x in list(n_word)] # position of each token within its review
index_word_long=sum(index_word,[]) # flatten

len(index_word_long)
review_long_clean=review_long_clean.copy() # avoid SettingWithCopyWarning before adding a column
review_long_clean['index_word']=index_word_long
review_long_clean.head()
review_long_clean.to_csv('1_review_long_clean.csv')

n_review_long_clean=review_long_clean[[ 'n' in nat for nat in review_long_clean.nature]] # keep noun-like tokens (POS flag contains 'n')
n_review_long_clean.shape
#print(n_review_long_clean.head())

#print(n_review_long_clean.nature.value_counts())
n_review_long_clean.to_csv('1_n_review_long_clean.csv')



font=r"C:\Windows\Fonts\msyh.ttc"

background_image=plt.imread('1.png')
wordcloud = WordCloud(font_path=font, max_words = 100, mode='RGBA' ,background_color='white',mask=background_image)
#wordcloud = WordCloud(font_path=font, max_words = 100, background_color='white',mask=background_image) #width=1600,height=1200, mode='RGBA'
#wordcloud = WordCloud(font_path=None,max_words = 100, background_color='white', mode='RGBA')
wordcloud.generate_from_frequencies(Counter(review_long_clean.word.values))
wordcloud.to_file('1_分词后的词云图.png')

plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off')
#plt.show()


font=r"C:\Windows\Fonts\msyh.ttc"

background_image=plt.imread('1.png')
wordcloud = WordCloud(font_path=font, max_words = 100, mode='RGBA' ,background_color='white',mask=background_image) #width=1600,height=1200
wordcloud.generate_from_frequencies(Counter(n_review_long_clean.word.values))
wordcloud.to_file('1_分词后的词云图(名词).png')

plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off')
#plt.show()



# Step 1: build the feature space and labels

Y=[]
for ind in review_long_clean.index_content.unique():
    y=[ word for word in review_long_clean.content_type[review_long_clean.index_content==ind].unique() ]
    Y.append(y[0]) # each review has a single label; keep it scalar so sklearn gets a 1-D target
#print('len(Y)=',len(Y))

X=[]
for ind in review_long_clean.index_content.unique():
    term=[ word for word in review_long_clean.word[review_long_clean.index_content==ind].values ]
    X.append(' '.join(term))
#print('len(X)=',len(X))

#print(X)
#print(Y)

# Step 2: train/test split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=7)

# Step 3: vectorize words into a binary (0/1) matrix
count_vec=CountVectorizer(binary=True)
x_train=count_vec.fit_transform(x_train)
x_test=count_vec.transform(x_test)
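
With binary=True each cell records presence/absence rather than counts; a minimal sanity check (the query string is made up, and the result depends on the fitted vocabulary):

#count_vec.transform(['满意 满意']).toarray().max()  # at most 1, never 2, because binary=True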

# Step 4: fit a decision tree
dtc=tree.DecisionTreeClassifier(max_depth=5)
dtc.fit(x_train,y_train)
#print('Accuracy on the training set: %.2f'% accuracy_score(y_train,dtc.predict(x_train)))

y_true=y_test
y_pred=dtc.predict(x_test)
#print(classification_report(y_true,y_pred))
#print('Accuracy on the test set: %.2f'% accuracy_score(y_true,y_pred))

# Step 5: draw the decision tree
# Requires the Graphviz binaries to be installed and on the PATH
os.environ["PATH"] += os.pathsep + 'C:\\Program Files\\Graphviz\\bin'
dot_data=tree.export_graphviz(dtc ,feature_names=count_vec.get_feature_names_out()) # get_feature_names() was removed in scikit-learn 1.2
graph=graphviz.Source(dot_data)
#print(graph)

#graph.render("test", view=True)

# Load the sentiment lexicons (one word per line; '/n' is a dummy separator
# that never matches, so each line is read as a single field)
pos_comment=pd.read_csv('正面评价词语(中文).txt',header=None,sep='/n',engine='python')
neg_comment=pd.read_csv('负面评价词语(中文).txt',header=None,sep='/n',engine='python')

pos_emotion=pd.read_csv('正面情感词语(中文).txt',header=None,sep='/n',engine='python')
neg_emotion=pd.read_csv('负面情感词语(中文).txt',header=None,sep='/n',engine='python')

#print(pos_comment.shape)
#print(neg_comment.shape)

#print(pos_emotion.shape)
#print(neg_emotion.shape)

pos=pd.concat([pos_comment,pos_emotion],axis=0)
pos.shape

neg=pd.concat([neg_comment,neg_emotion],axis=0)
neg.shape
# Add new words to the lexicons
c='点赞'
c in pos.values

d='歇菜'
d in neg.values
new_pos=pd.Series(['点赞'])
new_neg=pd.Series(['歇菜'])
positive=pd.concat([pos,new_pos],axis=0)
positive.shape

negative=pd.concat([neg,new_neg],axis=0)
negative.shape
positive.columns=['review']
positive['weight']=[1]*len(positive) # plain list: a Series here would align on the duplicated index left by concat
#print(positive)
negative.columns=['review']
negative['weight']=[-1]*len(negative)
#print(negative)
pos_neg=pd.concat([positive,negative],axis=0)
pos_neg.shape
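
A quick spot check that the new word landed in the merged lexicon:

#print(pos_neg[pos_neg['review']=='点赞'])  # expected weight: 1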

# Join the sentiment lexicon onto review_long_clean (left join on word)

data=review_long_clean.copy()
review_mltype=pd.merge(data,pos_neg,how='left',left_on='word',right_on='review')
review_mltype.shape

review_mltype=review_mltype.drop(['review'],axis=1)
review_mltype=review_mltype.replace(np.nan,0)
#print(review_mltype.head())

# Correct sentiment polarity for negation
notdict=pd.read_csv('not.csv')
#print(notdict.shape)

notdict['freq']=[1]*len(notdict)
#print(notdict)
# Preparation 1: copy the raw weights and add a row id
review_mltype['amend_weight']=review_mltype['weight']
review_mltype['id']=np.arange(0,review_mltype.shape[0])
#print(review_mltype)
# Preparation 2: keep only rows that carry a sentiment value

only_review_mltype=review_mltype[review_mltype['weight']!=0]
only_review_mltype.index=np.arange(0,only_review_mltype.shape[0]) # reset the index
only_review_mltype.shape
#print(only_review_mltype)
i=4
review_i=review_mltype[review_mltype['index_content']==only_review_mltype['index_content'][i]]
#print(review_i) # the review containing the i-th sentiment word
# Look at the two words before each sentiment word to detect negation:
# if the word opens the review there is nothing to check; if it is the
# second word, check only the single word before it.

index = only_review_mltype['id']

for i in range(0, only_review_mltype.shape[0]):

    review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]  # the review containing the i-th sentiment word
    review_i.index = np.arange(0, review_i.shape[0])  # after the reset, positional index = index_word - 1
    word_ind = only_review_mltype['index_word'][i]  # position (index_word) of the i-th sentiment word within its review

    # Case 1: the sentiment word opens the review -- nothing to check
    # Case 2: it is the second word -- check only the one word before it
    if word_ind == 2:
        ne = sum([review_i['word'][word_ind - 2] in notdict['term'].values])  # .values: plain `in` on a Series tests the index, not the values
        if ne == 1:
            review_mltype.loc[index[i], 'amend_weight'] = -review_mltype['weight'][index[i]]
    # Case 3: it is in the third position or later -- check the two words before it
    elif word_ind > 2:
        ne = sum([word in notdict['term'].values for word in
                  review_i['word'][[word_ind - 2, word_ind - 3]]])  # note the list indexer; the original used [word_ind-1, word_ind-2], which points at the sentiment word itself
        if ne == 1:
            review_mltype.loc[index[i], 'amend_weight'] = -review_mltype['weight'][index[i]]

review_mltype[(review_mltype['weight'] - review_mltype['amend_weight']) != 0]  # rows whose polarity was flipped by a negation word
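
A toy walk-through of the negation rule above, with the negation list hard-coded for illustration (the real list comes from not.csv):

toy_words = ['质量', '不', '满意']  # '满意' carries weight +1 and sits at index_word 3
toy_not = ['不', '没有']            # illustrative stand-in for notdict['term']
w_ind = 3
ne = sum(w in toy_not for w in [toy_words[w_ind - 2], toy_words[w_ind - 3]])  # the two preceding words
#print(-1 if ne == 1 else 1)  # -> -1: one negation word flips the polarity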


# Compute each review's sentiment score
#print(review_mltype.tail())
emotion_value = review_mltype.groupby('index_content', as_index=False)['amend_weight'].sum()
#print(emotion_value)
emotion_value.to_csv('./1_emotion_value', index=True, header=True)


        #查看情感分析效果
        # 每条评论的amend_weight总和不等于零

        content_emotion_value = emotion_value.copy()
        content_emotion_value.shape
        content_emotion_value = content_emotion_value[content_emotion_value['amend_weight'] != 0]
        content_emotion_value['ml_type'] = ''
        content_emotion_value['ml_type'][content_emotion_value['amend_weight'] > 0] = 'pos'
        content_emotion_value['ml_type'][content_emotion_value['amend_weight'] < 0] = 'neg'

        content_emotion_value.shape
        print(content_emotion_value)
        # 每条评论的amend_weight总和等于零
        # 这个方法其实不好用,有一半以上的评论区分不出正、负情感。

        content_emotion_value0 = emotion_value.copy()
        content_emotion_value0 = content_emotion_value0[content_emotion_value0['amend_weight'] == 0]
        content_emotion_value0.head()

        print(raw_data.content[6])
        print(raw_data.content[7])
        print(raw_data.content[8])
# Merge back into the main table

content_emotion_value = content_emotion_value.drop(['amend_weight'], axis=1)
review_mltype.shape
review_mltype = pd.merge(review_mltype, content_emotion_value, how='left', left_on='index_content',
                         right_on='index_content')
review_mltype = review_mltype.drop(['id'], axis=1)
review_mltype.shape
print(review_mltype)

review_mltype.to_csv('./1_review_mltype', index=True, header=True)
cate = ['index_content', 'content_type', 'ml_type']
data_type = review_mltype[cate].drop_duplicates()

# Compare the lexicon labels against the ground-truth labels
confusion_matrix = pd.crosstab(data_type['content_type'], data_type['ml_type'], margins=True)
print(confusion_matrix)
data = data_type[['content_type', 'ml_type']]
data = data.dropna(axis=0)
print(classification_report(data['content_type'], data['ml_type']))

data = review_mltype.copy()
data = data[data['amend_weight'] != 0]

word_data_pos = data[data['ml_type'] == 'pos']
word_data_neg = data[data['ml_type'] == 'neg']
# Word clouds for the lexicon-labelled positive and negative words

font = r"C:\Windows\Fonts\msyh.ttc"

background_image = plt.imread('1.png')
wordcloud = WordCloud(font_path=font, max_words=100, mode='RGBA', background_color='white',
                      mask=background_image)  # width=1600,height=1200
# wordcloud = WordCloud(max_words = 100, mode='RGBA' ,background_color='white') #width=1600,height=1200
wordcloud.generate_from_frequencies(Counter(word_data_pos.word.values))

plt.figure(figsize=(15, 7))
plt.imshow(wordcloud)
plt.axis('off')
#plt.show()

font = r"C:\Windows\Fonts\msyh.ttc"

background_image = plt.imread('1.png')
# background_image=plt.imread('./p6sad.jpg')
wordcloud = WordCloud(font_path=font, max_words=100, mode='RGBA', background_color='white',
                      mask=background_image)  # width=1600,height=1200
# wordcloud = WordCloud(max_words = 100, mode='RGBA' ,background_color='white') #width=1600,height=1200

wordcloud.generate_from_frequencies(Counter(word_data_neg.word.values))

plt.figure(figsize=(15, 7))
plt.imshow(wordcloud)
plt.axis('off')
#plt.show()

data = review_mltype.copy()

word_data_pos = data[data['ml_type'] == 'pos']
word_data_neg = data[data['ml_type'] == 'neg']
# Build the dictionaries (deduplicated vocabularies)

pos_dict = corpora.Dictionary([[i] for i in word_data_pos.word])  # shape=(n,1)
neg_dict = corpora.Dictionary([[i] for i in word_data_neg.word])
#print(pos_dict)
# Build the corpora

pos_corpus = [pos_dict.doc2bow(j) for j in [[i] for i in word_data_pos.word]]  # shape=(n,(2,1))
neg_corpus = [neg_dict.doc2bow(j) for j in [[i] for i in word_data_neg.word]]
len(word_data_pos.word)
len(pos_dict)
len(pos_corpus)
#print(pos_corpus) # each element is a list of (token_id, count) tuples: token_id is the word's position in the dictionary, count 1 marks its presence

# Helper for the topic-number search

def cos(vector1, vector2):
    '''
    Cosine similarity between two vectors.
    '''
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    else:
        return (dot_product / ((normA * normB) ** 0.5))
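
The same similarity can be written more compactly with numpy (already imported above); a minimal equivalent sketch:

def cos_np(vector1, vector2):
    v1, v2 = np.asarray(vector1, dtype=float), np.asarray(vector2, dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)  # product of the two vector norms
    return None if denom == 0.0 else float(np.dot(v1, v2) / denom)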


# Topic-number search
# This function is reusable for other projects

def LDA_k(x_corpus, x_dict):
    '''
    For k = 2..10, fit an LDA model and compute the mean pairwise cosine
    similarity between its topics (lower means the topics are more distinct).
    '''
    # Initialise the list of mean cosine similarities (k=1 is trivially 1)
    mean_similarity = []
    mean_similarity.append(1)

    # Fit a model for each candidate topic number and compare its topics
    for i in np.arange(2, 11):
        lda = models.LdaModel(x_corpus, num_topics=i, id2word=x_dict)  # train the LDA model
        term = lda.show_topics(num_topics=i, num_words=50)  # one call suffices; the original re-assigned this inside a redundant loop

        # Extract each topic's top words
        top_word = []  # shape=(i,50)
        for k in np.arange(i):
            top_word.append([''.join(re.findall('"(.*)"', w)) for w in term[k][1].split('+')])  # list every word

        # Build the word-frequency vectors
        word = sum(top_word, [])  # all top words across topics
        unique_word = set(word)  # deduplicate

        # Topic-word matrix: rows are topics, columns the unique words
        mat = []  # shape=(i,len(unique_word))
        for j in np.arange(i):
            top_w = top_word[j]
            mat.append(tuple([top_w.count(k) for k in unique_word]))  # frequency of each word in this topic's list, as a tuple

        # Pairwise combinations, method 1
        p = list(itertools.permutations(list(np.arange(i)), 2))  # all ordered pairs of topic indices
        y = len(p)  # y = i*(i-1)
        top_similarity = [0]
        for w in np.arange(y):
            vector1 = mat[p[w][0]]
            vector2 = mat[p[w][1]]
            top_similarity.append(cos(vector1, vector2))

        #        # pairwise combinations, method 2
        #        for x in range(i-1):
        #            for y in range(x,i):

        # Mean cosine similarity for this topic number
        mean_similarity.append(sum(top_similarity) / y)
    return mean_similarity
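
Since lower mean similarity means more distinct topics, one way to pick the topic number from the curves (a sketch, assuming the pos_k/neg_k lists computed just below) is:

#best_k = int(np.argmin(pos_k[1:])) + 2  # skip the k=1 placeholder; candidates start at k=2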


# Mean cosine similarity between topics for each candidate topic number

pos_k = LDA_k(pos_corpus, pos_dict)
neg_k = LDA_k(neg_corpus, neg_dict)

pos_k
neg_k
pd.Series(pos_k, index=range(1, 11)).plot()
plt.title('LDA topic-number search, positive reviews')
plt.show()
pd.Series(neg_k, index=range(1, 11)).plot()
plt.title('LDA topic-number search, negative reviews')
plt.show()
# Final LDA models, 2 topics each
pos_lda = models.LdaModel(pos_corpus, num_topics=2, id2word=pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics=2, id2word=neg_dict)

pos_lda.print_topics(num_topics=10)
neg_lda.print_topics(num_topics=10)