This Chinese sentiment-analysis project is the final assignment for a data mining and analysis course. The task: classify the sentiment of epidemic-period Weibo posts, then analyze how sentiment changed over time.
1. Datasets: a labeled training set of epidemic-related Weibo texts and an unlabeled set to be predicted, which is the source of the sentiment-trend analysis.
2. Python libraries: mainly jieba and pandas; see the import statements for the rest.
3. Main steps: word segmentation, stop-word removal, building a word-vector model, vectorizing the segmented texts, model training, and prediction.
[The file-path separators \\ and / were not made consistent. Some of the code is not very concise and is meant only as a reference for the steps. Related files and code (a group member's Weibo crawler, the cleaning scripts, and the dataset links) may be uploaded later.]
-- Word segmentation, stop-word removal, building word vectors (pandas was not used here, which I regret)
1. Imports and the main block:
- import jieba
- import numpy as np
- import pandas as pd
- import os
-
- import gensim
- from gensim.models import Word2Vec
-
- import csv
-
- if __name__ == '__main__':
-
-     data = pd.read_csv('D:\\documents\\data mining\\数据集\\情感分类-疫情微博\\nCoV_100k_train.labled.csv', engine="python")
-     #data = pd.read_csv('D:\\documents\\data mining\\数据集\\普通情感分类-7\\情感训练集.csv')
-     #print(data.head())
-
-     # extract the text and label columns (adjust per dataset)
-     data1 = list(data.iloc[:, 3])   # text column: 3 for the 100k set, 0 for the generic sentiment set
-     #print(data1[0])
-     label = list(data.iloc[:, 6])   # label column: 6 for the 100k set, 1 for the generic sentiment set
-
-     # word segmentation
-     size = 100   # dimension of the word vectors
-     (data2, label) = word_cut(data1, label, size)   # returns the segmented texts as lists of tokens
-     print('segmentation done')
-
-     print(len(data2), len(label))

2. Word segmentation, stop-word removal, word vectors
-
- def word_cut(data1, label, size):
-
-     filelist = []
-     for i in data1:
-         i = str(i)
-         i = i.replace('展开全文c', '')      # strip the Weibo "expand full text" marker
-         s = jieba.cut(i, cut_all=False)
-         cutstr = '$$$'.join(s)              # join tokens with the '$$$' separator
-         textlist = cutstr.split('$$$')
-         #print(textlist)
-         filelist.append(textlist)
-
-     filelist = removesw(filelist)           # list after stop-word removal, may contain empty entries
-
-     # drop texts that became empty (iterate backwards so deletion is safe)
-     for i in range(len(filelist) - 1, -1, -1):
-         if len(filelist[i]) == 0:
-             del filelist[i]
-             del label[i]
-
-     #print(len(filelist), len(label))
-     #print(filelist[0], label[0])
-     #print(filelist[-1], label[-1])
-
-     # write the segmented texts and their labels to a txt file
-     txtfile = open('D:/documents/data mining/数据集/代码/data_cut.txt', mode='w')
-
-     for i in range(len(filelist)):
-         string = ','.join(j for j in filelist[i] if j != '')
-         # encode/decode with 'ignore' drops characters that GBK cannot represent,
-         # so the default-encoding write on Windows does not fail
-         txtfile.write(string.encode("gbk", 'ignore').decode("gbk", "ignore") + ' ' + str(label[i]) + '\n')
-
-     txtfile.close()
-     print('cut_word written to txt')
-
-     # size= is the gensim 3.x keyword; in gensim >= 4.0 it is vector_size=
-     model = Word2Vec(filelist, size=size, window=5, min_count=1, workers=4)
-     model.save("D:/documents/data mining/数据集/代码/word2vec.bin")
-     print('cut_word added to the word-vector model')
-
-     return (filelist, label)
-

This block uses jieba to segment each text, joins the tokens with '$$$' as a separator, and removes stop words with the removesw function below.
The filtered token lists are then fed into the Word2Vec model; the filelist argument only needs to be an iterable of token lists. Adding the prediction texts to the model and obtaining text vectors are covered in Part 2.
-
- def removesw(filelist):   # filelist: a list of token lists
-
-     with open('D:/documents/data mining/数据集/stopwords-master/cn_stopwords.txt', 'r', encoding='utf-8') as f:
-         stop_words = f.readlines()
-     stop_words = set(word.replace('\n', '') for word in stop_words)   # a set makes the membership test fast
-
-     # stop-word filtering
-     for i in range(len(filelist)):
-         filelist[i] = [x for x in filelist[i] if x not in stop_words]
-
-     return filelist
This block removes stop words; the txt file is a stop-word list found online, which was edited along the way (words added and removed) to fit the Weibo context. The list comprehension inside the for loop is the core of the code.
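For reference, a minimal usage sketch of removesw on hand-made token lists (the sample sentences below are made up for illustration, and the exact output depends on the stop-word list used):
- # hypothetical input: two already-segmented sentences
- sample = [['今天', '的', '疫情', '数据', '很', '不错'],
-           ['大家', '都', '要', '加油']]
- print(removesw(sample))
- # with a typical Chinese stop-word list, fillers such as '的', '很', '都', '要' are
- # dropped, e.g. [['今天', '疫情', '数据', '不错'], ['大家', '加油']]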
-- This part mainly uses pandas. It segments and removes stop words from the prediction set and adds the result to the word-vector model built in Part 1, then uses the word-vector model and the segmented training & prediction sets to build text vectors and write them to .csv files.
1. Imports + data cleaning, segmentation, stop-word removal
(The cleaning step strips some meaningless phrases up front, because they could no longer be removed after segmentation.)
- import os
- import pandas as pd
- import jieba
-
- import gensim
- from gensim.models import Word2Vec
-
- import numpy as np
- import csv
-
-
- #---- data cleaning and segmentation ----
- with open('D:/documents/data mining/数据集/stopwords-master/cn_stopwords.txt', 'r', encoding='utf-8') as f:
-     stop_words = f.readlines()
- stop_words = [word.replace('\n', '') for word in stop_words]
- stop_words.append('\u200b')   # zero-width space, common in Weibo texts
-
- origin_dir = 'D:\\documents\\data mining\\数据集\\代码\\cleaned_text\\'
- files = os.listdir(origin_dir)
- after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
-
- def clean_mix(s):
-     # strip Weibo boilerplate phrases that segmentation could not filter out later
-     return s.replace('收起全文d', '').replace('展开全文d', '').replace('的秒拍视频', '').replace('的微博视频', '').replace('的快手视频', '').replace('\n', '').replace('O网页链接', '')
-
- def after_jieba_stopword(s):
-     # segment with jieba, drop stop words, return the tokens joined by spaces
-     a = jieba.cut(str(s), cut_all=False)
-     b = '$$$'.join(a)
-     c = [x for x in b.split('$$$') if x not in stop_words]
-     return ' '.join(c)
-
- N_origin = 0
- N_filter = 0
- for file in files:
-     data = pd.read_table(origin_dir + file, sep=',', encoding='utf-8')
-     N_origin += len(data)
-     # cleaning, then segmentation and stop-word removal
-     data['cleaned_text'] = data['cleaned_text'].map(lambda x: clean_mix(str(x)) if isinstance(x, str) else '')
-     data['cleaned_text'] = data['cleaned_text'].map(lambda x: after_jieba_stopword(x))
-     data['removeWellSign'] = data['removeWellSign'].map(lambda x: clean_mix(str(x)) if isinstance(x, str) else '')
-     data['removeWellSign'] = data['removeWellSign'].map(lambda x: after_jieba_stopword(x))
-     data_filter = data.loc[data['cleaned_text'] != '', :].copy()   # .copy() avoids SettingWithCopyWarning
-     data_filter['id'] = np.arange(0, len(data_filter), 1)
-     N_filter += len(data_filter)
-     data_filter[['id', 'original_text', 'cleaned_text', 'removeWellSign']].to_csv(after_clean_dir + file, sep=',', index=False, encoding='utf-8')
-     print(file, 'over')
-
- print(N_origin)
- print(N_filter)

2. Word-vector model training
-- Add the segmented prediction texts to the word-vector model
-
- # continue training the model on the prediction texts
- after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
- files = os.listdir(after_clean_dir)
- model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
-
- for file in files:
-     data = pd.read_table(after_clean_dir + file, sep=',', encoding='utf-8')
-     filelist = list(data['cleaned_text'].map(lambda x: x.split(' ')))
-
-     # build_vocab(update=True) adds the new words to the vocabulary before training;
-     # model.iter is the gensim 3.x attribute (model.epochs in gensim >= 4.0)
-     model.build_vocab(filelist, update=True)
-     model.train(filelist, total_examples=len(filelist), epochs=model.iter)
-     print(file, 'train over')
-
- model.save("D:/documents/data mining/数据集/代码/word2vec.bin")
- print('prediction texts added to the word-vector model')
3. Text vectorization
For each segmented text, look up the vector of every word that exists in the word-vector model (not every word is in it), sum the vectors with equal weight and take the average; the result is the text vector of that sentence.
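The averaging can also be written as a small stand-alone helper; this is only a sketch of the idea behind the loops below, using the gensim 3.x model[word] lookup (model.wv[word] in gensim >= 4.0):
- def text_to_vector(tokens, model, size=100):
-     # sum the vectors of the tokens that exist in the vocabulary, then average
-     vec = np.zeros(size)
-     count = 0
-     for word in tokens:
-         try:
-             vec += model[word]       # gensim < 4.0; use model.wv[word] in 4.x
-             count += 1
-         except KeyError:             # out-of-vocabulary word, skip it
-             continue
-     return vec / count if count else vec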
- # vectorize the ~1.06 million prediction texts
- after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
- vectors_dir = 'D:\\documents\\data mining\\数据集\\代码\\vectors\\'
- files = os.listdir(after_clean_dir)
- model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
-
- for file in files:
-     data = pd.read_table(after_clean_dir + file, sep=',', encoding='utf-8')
-     filelist = list(data['cleaned_text'].map(lambda x: x.split(' ')))
-
-     rows = []
-     for text in filelist:
-         text_vector = np.zeros(100)
-         count = 0
-         for word in text:
-             try:
-                 text_vector += model[word]   # gensim 3.x lookup; model.wv[word] in gensim >= 4.0
-                 count += 1
-             except KeyError:                 # word not in the vocabulary
-                 continue
-         if count != 0:
-             text_vector /= count             # average over the words that were found
-         rows.append(list(text_vector))
-
-     # collecting the rows in a list and building the DataFrame once is much faster
-     # than DataFrame.append inside the loop
-     df = pd.DataFrame(rows)
-     df.to_csv(vectors_dir + file, sep=',', index=False, header=False)
-     print(file, 'vectorization over')
-
-
- #--- training-set text vectorization ---
- model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
- txtfile = open('D:\\documents\\data mining\\数据集\\代码\\data_cut.txt', 'r')
-
- data = []
- for i in txtfile.readlines():
-     a = i.split(' ')
-     a = [word.replace('\n', '') for word in a]
-     data.append(a)   # each row is [cut_words, label]
- txtfile.close()
-
- for i in data:
-     text = i[0].split(',')
-     text_vector = np.zeros(100)
-     count = 0
-     for word in text:
-         try:
-             text_vector += model[word]   # gensim 3.x lookup; model.wv[word] in gensim >= 4.0
-             count += 1
-         except KeyError:
-             continue
-     if count != 0:
-         text_vector /= count             # average over the words that were found
-
-     i.append(list(text_vector))          # each row becomes [cut_words, label, vector]
-
- print(data[0])
-
- # write one row of 100 vector components plus the label to the training-vector csv
- with open('D:\\documents\\data mining\\数据集\\代码\\trainText_vector.csv', 'w', newline='') as tf:
-     writer = csv.writer(tf, delimiter=',')
-     for row in data:
-         row1 = row[2]
-         row1.append(int(row[1]))   # the last column is the label
-         writer.writerow(row1)
- print('training-text vectorization finished')

4. Model training
-- The model here is a decision tree wrapped in a OneVsOne classification scheme, chosen after comparing several models. During training, the training-set vectors are split 9:1 into training and test parts; accuracy is fairly high and the model also behaves well on the prediction set.
- from sklearn.multiclass import OneVsOneClassifier
- from sklearn.tree import DecisionTreeRegressor
-
- from sklearn.model_selection import train_test_split
-
- from joblib import dump, load
-
- #--- model training and prediction ---
- after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
- vectors_dir = 'D:\\documents\\data mining\\数据集\\代码\\vectors\\'
- label_dir = 'D:\\documents\\data mining\\数据集\\代码\\text_label\\'
- files = os.listdir(after_clean_dir)
-
- # model training
- labeled_path = 'D:\\documents\\data mining\\数据集\\代码\\trainText_vector.csv'
-
- # the vector file was written without a header row, so header=None is needed
- labeled = pd.read_table(labeled_path, sep=',', header=None)
- n = len(labeled)   # 11281
-
- vectors = labeled.iloc[:, :-1]
- labels = labeled.iloc[:, -1]
-
- # 9:1 split into training and test sets
- X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.1)
-
- y_test_list = list(y_test)
-
- n_train = len(y_train)   # 10152
- n_test = len(y_test)     # 1129
-
- def accuracy(a, b):
-     # fraction of positions where the two label sequences agree
-     c = []
-     for i in range(len(a)):
-         if a[i] == b[i]:
-             c.append(1)
-         else:
-             c.append(0)
-     return sum(c) / len(c)
-
-
- # one-vs-one wrapper; DecisionTreeClassifier would be the more conventional base
- # estimator here, but the regressor is what produced the reported accuracy
- model_tree_one = OneVsOneClassifier(DecisionTreeRegressor())
- model_tree_one.fit(X_train, y_train)
- predict_tree_one = model_tree_one.predict(X_test)
- print(predict_tree_one)
- accuracy_tree_one = accuracy(predict_tree_one, y_test_list)   # 0.7478753541076487
- print("accuracy_tree_one:" + str(accuracy_tree_one))
-
- dump(model_tree_one, 'model_tree_one.joblib')
- print('prediction model trained and saved')

5. Sentiment classification prediction
- # prediction
- # load the model saved in step 4 (a separately trained svc.joblib could be swapped in here)
- model_tree_one = load('D:\\documents\\data mining\\数据集\\代码\\model_tree_one.joblib')
- #model_tree_one = load('D:\\documents\\data mining\\数据集\\代码\\svc.joblib')
- for file in files:
-     vectors_file = pd.read_table(vectors_dir + file, sep=',', header=None)
-     text_file = pd.read_table(after_clean_dir + file, sep=',')
-
-     result = model_tree_one.predict(vectors_file)
-
-     text_file['label'] = result
-
-     text_file.to_csv(label_dir + file, sep=',', index=False)
-     print(file, 'predict over')
-
6. Writing the classification statistics (positive, negative, totals, etc.) to a .csv
- # prediction result statistics
- from pandas import DataFrame
- analysis_dir = 'D:\\documents\\data mining\\数据集\\代码\\text_label\\'
- analysis_files = os.listdir(analysis_dir)
-
- rows = []
- for file in analysis_files:
-     analysis_file = pd.read_table(analysis_dir + file, sep=',')
-
-     # count the predicted labels in this file (one file per date)
-     vc = analysis_file['label'].value_counts(normalize=False, dropna=False)
-
-     pos = vc[1]
-     neg = vc[-1]
-     total = analysis_file['label'].count()
-
-     print(file, neg, pos, total)
-
-     rows.append([file.replace('.csv', '').replace('.', '-'), neg, pos, total])
-
- analysis_df = DataFrame(rows, columns=['date', 'neg', 'pos', 'total'])
- analysis_df.to_csv('D:\\documents\\data mining\\数据集\\代码\\结果图.csv', sep=',', index=False)
-
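To get the sentiment-trend curve mentioned at the beginning, the 结果图.csv file written above can be plotted directly; a minimal matplotlib sketch (matplotlib is an extra dependency not used elsewhere in the code):
- import matplotlib.pyplot as plt
-
- trend = pd.read_csv('D:\\documents\\data mining\\数据集\\代码\\结果图.csv')
- trend.plot(x='date', y=['pos', 'neg'], marker='o')   # posts per day by predicted sentiment
- plt.xlabel('date')
- plt.ylabel('number of posts')
- plt.show()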
