赞
踩
# -*- coding: utf-8 -*-
"""
Build a word-cloud image and word-frequency statistics from a folder
of .txt files.

Created on Wed Apr 22 21:22:42 2020
@author: Administrator
"""
import os
import string
from collections import Counter

import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud


class Wordcloud():
    """Merge all .txt files in a folder, segment the text with jieba,
    count word frequencies, and render a word-cloud image."""

    def __init__(self, path, back_coloring_path, save_path, width, height,
                 max_words, min_length, stop_words,
                 background_color='white',
                 font_path="simhei.ttf",
                 cut_all=True):
        """
        :param path: folder that contains the source .txt files
        :param back_coloring_path: image file used as the cloud mask
        :param save_path: where to save the generated word-cloud image
        :param width: width of the word-cloud canvas
        :param height: height of the word-cloud canvas
        :param max_words: maximum number of words displayed
        :param min_length: minimum word length counted
        :param stop_words: list of words to drop, or None to disable
        :param background_color: canvas background colour
        :param font_path: font used to draw words (needed for Chinese glyphs)
        :param cut_all: jieba full-mode flag (False = precise mode)
        """
        self.path = path
        self.save_path = save_path
        # Load the mask image once and keep it as a numpy array.
        self.back_coloring_path = np.array(Image.open(back_coloring_path))
        self.width = width
        self.height = height
        self.stop_words = stop_words
        self.cut_all = cut_all
        self.max_words = max_words
        self.font_path = font_path
        self.background_color = background_color
        self.min_length = min_length

    def replace_punctutation(self, word):
        """Return *word* with Chinese and ASCII punctuation removed.

        Bug fix: ``str.replace`` returns a new string, but the original
        code discarded the result (and the second loop replaced the stale
        loop variable ``i`` instead of ``j``), so punctuation was never
        actually stripped.
        """
        # Common Chinese (full-width) punctuation marks.
        chinese_punctuation = ['【', '】', '《', '》', ':', ',', '(', ')',
                               '、', '!', '?', '“', '”', '。', '.', '/',
                               '%', ';']
        for ch in chinese_punctuation:
            word = word.replace(ch, "")
        # ASCII punctuation.
        for ch in string.punctuation:
            word = word.replace(ch, "")
        return word

    def __open_split_file(self, path):
        """Merge every .txt under *path* and return the segmented words."""
        file_path = self.join_txt(path)
        file_words = self.open_file(file_path)
        return self.__seg_words(file_words)

    def open_file(self, path):
        """Read *path* line by line, stripping all whitespace per line.

        Bug fix: the merged file is now written as UTF-8 (see join_txt),
        so it is read back with the same encoding instead of 'gbk'.
        """
        file_words = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Collapse all internal/edge whitespace in the line.
                file_words.append(''.join(line.split()))
        return file_words

    def __seg_words(self, file_words):
        """Segment each line with jieba, strip punctuation and stop words,
        and return the remaining non-empty tokens."""
        seg_lists = []
        for line in file_words:
            # NOTE: cut_all=False is jieba's precise mode (the original
            # comment had this backwards).
            seg_lists.extend(jieba.cut(line, cut_all=self.cut_all))
        results = []
        for token in seg_lists:
            token = self.replace_punctutation(token)  # drop punctuation
            if self.stop_words:
                token = self.delet_words(token)  # drop stop words
            results.append(token)
        # Bug fix: list.remove('') only deleted the FIRST empty string;
        # filter out every empty token instead.
        return [token for token in results if token]

    def delet_words(self, word):
        """Return '' when *word* contains any stop word, else *word*."""
        for stop in self.stop_words:
            if stop in word:
                return ""
        return word

    def __count_words(self, results):
        """Count the frequency of every word of at least min_length chars."""
        counter = Counter()
        for word in results:
            if len(word) >= self.min_length and word != '\n':
                counter[word] += 1
        return counter

    def word_cloud_and_count_words(self):
        """Generate the word-cloud image and the word-frequency counter.

        :return: tuple (Counter of word frequencies, PIL image of the cloud)
        """
        words = self.__open_split_file(self.path)  # merge + segment
        counter = self.__count_words(words)        # frequency statistics
        text = ' '.join(words)
        wordcloud = WordCloud(
            background_color=self.background_color,
            width=self.width,
            height=self.height,
            margin=2,
            max_words=self.max_words,
            mask=self.back_coloring_path,
            font_path=self.font_path,
            random_state=100)  # fixed seed -> reproducible colour scheme
        word_cloud = wordcloud.generate(text)
        image = word_cloud.to_image()
        image.save(self.save_path)
        return counter, image

    def join_txt(self, path):
        """Concatenate every .txt file under *path* into results.txt and
        return its path.

        Bug fixes: files are opened via context managers (the read handles
        were never closed) with an explicit UTF-8 encoding that matches
        open_file; a previously generated results.txt is skipped so it is
        not merged into itself on a second run; the suffix test now uses
        endswith('.txt') instead of a substring match.
        """
        result_path = os.path.join(path, 'results.txt')
        file_names = os.listdir(path)
        with open(result_path, 'w', encoding='utf-8') as out:
            for file_name in file_names:
                # Only merge .txt sources, and never the output file itself.
                if not file_name.endswith(".txt") or file_name == 'results.txt':
                    continue
                file_path = os.path.join(path, file_name)
                with open(file_path, encoding='utf-8') as src:
                    for line in src:
                        out.writelines(line)
                out.write('\n')  # separate each source file by a newline
        return result_path


if __name__ == "__main__":
    # Runtime configuration for the word-cloud generator.
    args = {
        "path": "D:\\cloud_words\\",                        # folder with source texts
        "back_coloring_path": "D:\\cloud_words\\爱心.png",   # mask image path
        "save_path": "D:\\cloud_words\\词云图.png",          # output image path
        "background_color": "white",  # cloud background colour
        "font_path": "simhei.ttf",    # SimHei font, renders Chinese glyphs
        "cut_all": False,             # precise segmentation mode
        "width": 300,                 # cloud width
        "height": 400,                # cloud height
        "max_words": 100,             # max words shown
        "min_length": 2,              # minimum word length counted
        "stop_words": None            # optional stop-word list
    }
    word_cloud = Wordcloud(**args)
    counter, image = word_cloud.word_cloud_and_count_words()
    print(counter)
    image.show()
文本内容
假如你不够快乐
作者:汪国真
也不要把眉头深锁
人生本来短暂
为什么 还要栽培苦涩
打开尘封的门窗
让阳光雨露洒遍每个角落
走向生命的原野
让风儿熨平前额
博大可以稀释忧愁
深色能够覆盖浅色
背景图
最终效果如下
Counter({'假如': 1, '不够': 1, '快乐': 1, '作者': 1, '汪国真': 1, '不要': 1, '眉头': 1, '深锁': 1, '人生': 1, '本来': 1, '短暂': 1, '为什么': 1, '还要': 1, '栽培': 1, '苦涩': 1, '打开': 1, '尘封': 1, '门窗': 1, '阳光雨露': 1, '洒遍': 1, '每个': 1, '角落': 1, '走向': 1, '生命': 1, '原野': 1, '风儿': 1, '熨平': 1, '前额': 1, '博大': 1, '可以': 1, '稀释': 1, '忧愁': 1, '深色': 1, '能够': 1, '覆盖': 1, '浅色': 1})
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。