Lda模型分析完整代码_lda模型一致性代码

作者：你好赵伟 | 2024-06-14 22:38:30

踩

lda模型一致性代码

之前参考的那位博主的代码写得非常好，但是一些非计算机专业的新手运行不起来，于是在后台私信我。这里统一放一份可以直接运行的完整代码。我也不是专业的，只是恰好用到了，如有错误，请多多包涵。大家记得给原博主点个赞（原博主代码链接）。


import re
import jieba as jb
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import codecs
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt
import matplotlib
if __name__ == '__main__':
 
    #--------------------------句子拆分------------------------------
    def stopwordslist(filepath):
        """Load a stop-word list from a UTF-8 text file, one entry per line.

        Args:
            filepath: Path of the stop-word file.

        Returns:
            A list holding every line of the file with surrounding whitespace
            stripped, in file order (a blank line yields an empty string).
        """
        # The context manager closes the handle even when reading fails; the
        # previous version left the file open until garbage collection.
        with open(filepath, 'r', encoding='utf-8') as stopword_file:
            return [line.strip() for line in stopword_file]
 
    # 对句子进行分词
    def seg_sentence(sentence):
        """Tokenise one line of text with jieba and filter the tokens.

        Digits and dots are removed first, then the stripped text is cut with
        jieba. Tokens listed in the stop-word file and tokens of length one
        are discarded.

        Args:
            sentence: Raw text of one document (one line of the input file).

        Returns:
            The kept tokens concatenated into one string, each token followed
            by a single space; an empty string when no token survives.
        """
        # Load the stop words once and cache them as a set on the function
        # object. The previous version re-read the file for every sentence
        # and did a linear list scan for every token.
        stopwords = getattr(seg_sentence, '_stopword_cache', None)
        if stopwords is None:
            stopwords = frozenset(stopwordslist('自己搜来的停用词表.txt'))  # path of the stop-word file
            seg_sentence._stopword_cache = stopwords

        # Raw string: '\.' inside a normal literal is an invalid escape sequence.
        sentence = re.sub(r'[0-9\.]+', '', sentence)
        # jb.add_word('词汇')  # hook for adding custom words to the jieba dictionary
        # The length filter also drops lone '\t' tokens, so no separate tab
        # check is needed.
        kept = [
            word for word in jb.cut(sentence.strip())
            if len(word) > 1 and word not in stopwords
        ]
        return ''.join(word + ' ' for word in kept)
 
    # Tokenise the raw corpus line by line and write one tokenised document
    # per output line. `with` closes both files even if tokenisation raises,
    # which the explicit close() calls did not guarantee.
    with open('感想.txt', 'r', encoding='utf-8') as inputs, \
            open('感想分词.txt', 'w', encoding='utf-8') as outputs:
        for line in inputs:
            line_seg = seg_sentence(line)  # space-separated tokens as one string
            outputs.write(line_seg + '\n')
 
 
    # --------------------------------开始构建lda模型-------------------------------
    # Read the tokenised corpus: one document per line, tokens separated by
    # whitespace. `with` makes sure the file handle is released.
    train = []
    with codecs.open('感想分词.txt', 'r', encoding='utf8') as fp:
        for line in fp:
            tokens = line.split()
            # Lines read from a file keep their trailing '\n', so the old
            # `line != ''` test never filtered anything. Skip documents that
            # contain no tokens instead.
            if tokens:
                train.append(tokens)

    dictionary = corpora.Dictionary(train)  # token -> integer id mapping
    corpus = [dictionary.doc2bow(text) for text in train]  # bag-of-words per document

    # num_topics: number of topics
    # passes: number of training passes over the corpus
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, passes=100)

    # num_words: number of terms reported for each topic.
    # Each entry is (topic_id, 'weight*"term" + weight*"term" + ...').
    for topic in lda.print_topics(num_words=20):
        print(topic[0], ':', sep='')
        # Split on the exact ' + ' separator and only on the first '*', so
        # terms that themselves contain '+' or '*' are not torn apart and no
        # stray spaces end up in the output.
        for term in topic[1].split(' + '):
            weight, word = term.split('*', 1)
            print('  ', word, ' (', weight, ')', sep='')

    # ------------------------------- visualisation -------------------------------
    # Reuse the model, corpus and dictionary built above. The previous version
    # re-read the file and trained a second, identical-config 100-pass model
    # here, doubling the training time for no benefit.
    #   lda:        the trained topic model
    #   corpus:     bag-of-words representation of the documents
    #   dictionary: the vocabulary (token <-> id mapping)
    d = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)

    # pyLDAvis.show(d)     # open the visualisation in a browser
    # pyLDAvis.display(d)  # render it in a notebook output cell
    pyLDAvis.save_html(d, 'lda_pass4.html')
 
    # ----------------------------困惑度计算--------------------------------------
    # Prepare the data
    PATH = "感想分词.txt"  # the tokenised corpus written by the segmentation step

    # Read the file inside `with` so the handle is closed, and keep only
    # documents that still contain tokens. The old read().split('\n') always
    # appended an empty document for the text after the final newline.
    data_set = []  # one token list per document
    with open(PATH, encoding='utf-8', errors='ignore') as tokenised_file:
        for line in tokenised_file:
            tokens = line.split()
            if tokens:
                data_set.append(tokens)
    print(data_set)  # dump all token lists

    dictionary = corpora.Dictionary(data_set)  # token -> integer id mapping
    corpus = [dictionary.doc2bow(text) for text in data_set]  # bag-of-words per document
    Lda = gensim.models.ldamodel.LdaModel  # alias of the LDA model class
 
 
    # 计算困惑度
    def perplexity(num_topics):
        """Train an LDA model and return its perplexity on the training corpus.

        Args:
            num_topics: Number of topics for the model.

        Returns:
            The perplexity ``2 ** (-bound)``, where ``bound`` is the per-word
            likelihood bound reported by ``log_perplexity``. Lower is better.
        """
        # passes: number of training passes over the corpus
        ldamodel = Lda(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
        # num_words: number of terms shown per topic
        print(ldamodel.print_topics(num_topics=num_topics, num_words=20))
        # log_perplexity() returns the per-word likelihood bound, not the
        # perplexity itself. Evaluate it once (it was computed twice before)
        # and convert it, so the value matches the plot's "perplexity" axis.
        bound = ldamodel.log_perplexity(corpus)
        print(bound)
        return 2 ** (-bound)
 
 
    # Draw a line chart of the perplexity() result against the topic count
    topic_counts = range(1, 20)  # candidate numbers of topics
    scores = list(map(perplexity, topic_counts))
    plt.plot(topic_counts, scores)
    plt.xlabel('主题数目')
    plt.ylabel('困惑度大小')
    # Select the SimHei font for the Chinese labels (it must be installed) and
    # turn off the Unicode minus sign for axis tick labels.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })
    plt.title('主题-困惑度变化情况')
    plt.show()
 
    #-------------------------------一致性得分-------------------------------------
    # Prepare the data
    PATH = "感想分词.txt"  # the tokenised corpus written by the segmentation step

    # Read the file inside `with` so the handle is closed, and keep only
    # documents that still contain tokens. The old read().split('\n') always
    # appended an empty document for the text after the final newline.
    data_set = []  # one token list per document
    with open(PATH, encoding='utf-8', errors='ignore') as tokenised_file:
        for line in tokenised_file:
            tokens = line.split()
            if tokens:
                data_set.append(tokens)
    print(data_set)  # dump all token lists

    dictionary = corpora.Dictionary(data_set)  # token -> integer id mapping
    corpus = [dictionary.doc2bow(text) for text in data_set]  # bag-of-words per document
    Lda = gensim.models.ldamodel.LdaModel  # alias of the LDA model class
 
 
    def coherence(num_topics):
        """Train an LDA model with ``num_topics`` topics and return its c_v coherence.

        The model is trained on ``corpus`` with 50 passes and scored against
        the token lists in ``data_set``. The score is printed and returned.
        """
        ldamodel = Lda(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
        coherence_lda = CoherenceModel(
            model=ldamodel,
            texts=data_set,
            dictionary=dictionary,
            coherence='c_v',
        ).get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        return coherence_lda
 
 
    # Draw a line chart of the coherence score against the topic count
    # (the old comment said "perplexity").
    # range(1, 2) evaluated a single topic count, which leaves plt.plot with
    # one point and no visible line. Use the same candidate range as the
    # perplexity chart so the curve can actually be compared.
    x = range(1, 20)  # candidate numbers of topics
    y = [coherence(i) for i in x]
    plt.plot(x, y)
    plt.xlabel('主题数目')
    plt.ylabel('coherence大小')
    # SimHei is selected for the Chinese labels (it must be installed);
    # unicode_minus is disabled for axis tick labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.title('主题-coherence变化情况')
    plt.show()

除此以外，你还需要在同级目录下存放以下四个文件：

其中情感分析.py可以自己找，我只提供一个参考，让你们知道文件里大概是什么内容。


from textblob import TextBlob
 
# Wrap the sample sentence in a TextBlob object
text = TextBlob("I am feel sad")

# Read the polarity score of the sentence
sentiment = text.sentiment.polarity

# Print the raw score, then a label chosen by its sign:
# positive -> "积极的", exactly zero -> "中性词", anything else -> "消极的"
print(sentiment)
label = "积极的" if sentiment > 0 else ("中性词" if sentiment == 0 else "消极的")
print(label)

感想.txt的内容需要自己用爬虫收集，每行一条评论；以下只是部分例子，实际使用时你需要非常多行数据。


甘蓝甜甜的，不像超市的苦有农药味，以后就在这家买了
卷心菜每次必选 。烩点儿饼丝 。凉拌非常好吃 。
卷心菜也非常新鲜 ，全部原拍
太惊喜了，超级新鲜，以后就这家了
包菜之前也拍过，炒出来味道好像甜一些，对于市场打药多的包菜，有机虽然贵很多，但吃着放心。
经常回购这家，包装很好，夏季保鲜有保证。
质量不好
不是很鲜
不错，菜真新鲜
蔬菜收到很新鲜，吃起来口感很好。已是第二次回购！
冬瓜特別好吃 感覺還是吃有機的安全 會一直回購的
再次买了，是新鲜的有机蔬菜，很好吃，有需要还会再来光顾，满意的好评！
菜很新鲜，有机放心，继续回购。

自己搜来的停用词表.txt里面是分词之后需要过滤掉的停用词（每行一个词），而不是用来分割句子的单词，大概是以下样子，你百度一下就有了


啊
阿
哎
哎呀
哎哟
唉
俺
俺们
按
按照
吧
吧哒
把
罢了
被
本
本着
比
比方
比如
鄙人
彼
彼此
边
别
别的
别说

最后感想分词.txt是我们运行程序生成的，不是你自己搜集的文件。

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/你好赵伟/article/detail/719883