赞
踩
# -*- coding: utf-8 -*-
"""
Build a word-cloud image and word-frequency statistics from a folder
of .txt files.

Created on Wed Apr 22 21:22:42 2020
@author: Administrator
"""
import os
import string
from collections import Counter

import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud


class Wordcloud():
    """Merge all .txt files in a folder, segment the text with jieba,
    count word frequencies, and render a word-cloud image."""

    def __init__(self, path, back_coloring_path, save_path, width, height,
                 max_words, min_length, stop_words,
                 background_color='white',
                 font_path="simhei.ttf",
                 cut_all=True):
        """
        :param path: folder that contains the source .txt files
        :param back_coloring_path: image file used as the cloud mask
        :param save_path: where to save the generated word-cloud image
        :param width: width of the word-cloud canvas
        :param height: height of the word-cloud canvas
        :param max_words: maximum number of words displayed
        :param min_length: minimum word length counted
        :param stop_words: list of words to drop, or None to disable
        :param background_color: canvas background colour
        :param font_path: font used to draw words (needed for Chinese glyphs)
        :param cut_all: jieba full-mode flag (False = precise mode)
        """
        self.path = path
        self.save_path = save_path
        # Load the mask image once and keep it as a numpy array.
        self.back_coloring_path = np.array(Image.open(back_coloring_path))
        self.width = width
        self.height = height
        self.stop_words = stop_words
        self.cut_all = cut_all
        self.max_words = max_words
        self.font_path = font_path
        self.background_color = background_color
        self.min_length = min_length

    def replace_punctutation(self, word):
        """Return *word* with Chinese and ASCII punctuation removed.

        Bug fix: ``str.replace`` returns a new string, but the original
        code discarded the result (and the second loop replaced the stale
        loop variable ``i`` instead of ``j``), so punctuation was never
        actually stripped.
        """
        # Common Chinese (full-width) punctuation marks.
        chinese_punctuation = ['【', '】', '《', '》', ':', ',', '(', ')',
                               '、', '!', '?', '“', '”', '。', '.', '/',
                               '%', ';']
        for ch in chinese_punctuation:
            word = word.replace(ch, "")
        # ASCII punctuation.
        for ch in string.punctuation:
            word = word.replace(ch, "")
        return word

    def __open_split_file(self, path):
        """Merge every .txt under *path* and return the segmented words."""
        file_path = self.join_txt(path)
        file_words = self.open_file(file_path)
        return self.__seg_words(file_words)

    def open_file(self, path):
        """Read *path* line by line, stripping all whitespace per line.

        Bug fix: the merged file is now written as UTF-8 (see join_txt),
        so it is read back with the same encoding instead of 'gbk'.
        """
        file_words = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Collapse all internal/edge whitespace in the line.
                file_words.append(''.join(line.split()))
        return file_words

    def __seg_words(self, file_words):
        """Segment each line with jieba, strip punctuation and stop words,
        and return the remaining non-empty tokens."""
        seg_lists = []
        for line in file_words:
            # NOTE: cut_all=False is jieba's precise mode (the original
            # comment had this backwards).
            seg_lists.extend(jieba.cut(line, cut_all=self.cut_all))
        results = []
        for token in seg_lists:
            token = self.replace_punctutation(token)  # drop punctuation
            if self.stop_words:
                token = self.delet_words(token)  # drop stop words
            results.append(token)
        # Bug fix: list.remove('') only deleted the FIRST empty string;
        # filter out every empty token instead.
        return [token for token in results if token]

    def delet_words(self, word):
        """Return '' when *word* contains any stop word, else *word*."""
        for stop in self.stop_words:
            if stop in word:
                return ""
        return word

    def __count_words(self, results):
        """Count the frequency of every word of at least min_length chars."""
        counter = Counter()
        for word in results:
            if len(word) >= self.min_length and word != '\n':
                counter[word] += 1
        return counter

    def word_cloud_and_count_words(self):
        """Generate the word-cloud image and the word-frequency counter.

        :return: tuple (Counter of word frequencies, PIL image of the cloud)
        """
        words = self.__open_split_file(self.path)  # merge + segment
        counter = self.__count_words(words)        # frequency statistics
        text = ' '.join(words)
        wordcloud = WordCloud(
            background_color=self.background_color,
            width=self.width,
            height=self.height,
            margin=2,
            max_words=self.max_words,
            mask=self.back_coloring_path,
            font_path=self.font_path,
            random_state=100)  # fixed seed -> reproducible colour scheme
        word_cloud = wordcloud.generate(text)
        image = word_cloud.to_image()
        image.save(self.save_path)
        return counter, image

    def join_txt(self, path):
        """Concatenate every .txt file under *path* into results.txt and
        return its path.

        Bug fixes: files are opened via context managers (the read handles
        were never closed) with an explicit UTF-8 encoding that matches
        open_file; a previously generated results.txt is skipped so it is
        not merged into itself on a second run; the suffix test now uses
        endswith('.txt') instead of a substring match.
        """
        result_path = os.path.join(path, 'results.txt')
        file_names = os.listdir(path)
        with open(result_path, 'w', encoding='utf-8') as out:
            for file_name in file_names:
                # Only merge .txt sources, and never the output file itself.
                if not file_name.endswith(".txt") or file_name == 'results.txt':
                    continue
                file_path = os.path.join(path, file_name)
                with open(file_path, encoding='utf-8') as src:
                    for line in src:
                        out.writelines(line)
                out.write('\n')  # separate each source file by a newline
        return result_path


if __name__ == "__main__":
    # Runtime configuration for the word-cloud generator.
    args = {
        "path": "D:\\cloud_words\\",                        # folder with source texts
        "back_coloring_path": "D:\\cloud_words\\爱心.png",   # mask image path
        "save_path": "D:\\cloud_words\\词云图.png",          # output image path
        "background_color": "white",  # cloud background colour
        "font_path": "simhei.ttf",    # SimHei font, renders Chinese glyphs
        "cut_all": False,             # precise segmentation mode
        "width": 300,                 # cloud width
        "height": 400,                # cloud height
        "max_words": 100,             # max words shown
        "min_length": 2,              # minimum word length counted
        "stop_words": None            # optional stop-word list
    }
    word_cloud = Wordcloud(**args)
    counter, image = word_cloud.word_cloud_and_count_words()
    print(counter)
    image.show()
文本内容
假如你不够快乐
作者:汪国真
也不要把眉头深锁
人生本来短暂
为什么 还要栽培苦涩
打开尘封的门窗
让阳光雨露洒遍每个角落
走向生命的原野
让风儿熨平前额
博大可以稀释忧愁
深色能够覆盖浅色
背景图
最终效果如下
Counter({'假如': 1, '不够': 1, '快乐': 1, '作者': 1, '汪国真': 1, '不要': 1, '眉头': 1, '深锁': 1, '人生': 1, '本来': 1, '短暂': 1, '为什么': 1, '还要': 1, '栽培': 1, '苦涩': 1, '打开': 1, '尘封': 1, '门窗': 1, '阳光雨露': 1, '洒遍': 1, '每个': 1, '角落': 1, '走向': 1, '生命': 1, '原野': 1, '风儿': 1, '熨平': 1, '前额': 1, '博大': 1, '可以': 1, '稀释': 1, '忧愁': 1, '深色': 1, '能够': 1, '覆盖': 1, '浅色': 1})
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。