当前位置:   article > 正文

LDA主题模型Python实现_lda主题模型python代码

lda主题模型python代码

        如果你有一个文本文件,那么以下这段代码可以帮助你实现LDA主题模型

  """Build a word cloud and an LDA topic model from a Chinese text file.

  The script reads ``temp.txt`` (UTF-8), treats every non-empty line as one
  document, segments it with jieba, saves a word cloud image and prints the
  LDA topics together with the model's u_mass coherence score.
  """
  import jieba
  import pyLDAvis.gensim_models
  import wordcloud
  from gensim.models.coherencemodel import CoherenceModel
  from gensim.models.ldamodel import LdaModel
  from gensim.corpora.dictionary import Dictionary

  INPUT_PATH = "temp.txt"
  WORDCLOUD_PATH = "XXXXX.png"
  FONT_PATH = "msyh.ttc"  # a font with CJK glyphs is required to render Chinese
  NUM_TOPICS = 2

  # Tokens that carry no meaning for the word cloud or the topic model.
  # Both the ASCII and the full-width form of each mark are listed because
  # Chinese text normally uses the full-width ones.
  PUNCTUATION = frozenset(
      [
          ",", "。", ":", ";", "?", "(", ")", "【", "】", "&", "!", "、",
          "*", "@", "#", "$", "%", ".", '"', "“", "”", "[", "]", "‘", "’",
          "。”",
          ",", ":", ";", "?", "(", ")", "!",
      ]
  )


  def read_lines(path):
      """Return the non-blank lines of the UTF-8 text file at *path*."""
      with open(path, "r", encoding="utf-8") as f:
          stripped = (line.strip("\n") for line in f)
          return [line for line in stripped if line.strip()]


  def tokenize(line):
      """Segment *line* with jieba (full mode) and drop punctuation and blanks."""
      tokens = (token.strip() for token in jieba.cut(line, cut_all=True))
      return [token for token in tokens if token and token not in PUNCTUATION]


  def save_word_cloud(tokens, output_path):
      """Render *tokens* as a word cloud and save the image to *output_path*."""
      cloud = wordcloud.WordCloud(
          font_path=FONT_PATH,
          width=1000,
          height=700,
          background_color="white",
          max_words=100,
          # Must be a collection of words. Passing the text itself would be
          # iterated character by character and turn every character of the
          # text into a stop word.
          stopwords=set(PUNCTUATION),
      )
      cloud.generate(" ".join(tokens))
      cloud.to_file(output_path)


  def train_lda(documents):
      """Train an LDA model on *documents* (a list of token lists).

      Returns the tuple ``(lda, corpus, dictionary)``.
      """
      dictionary = Dictionary(documents)
      # Map every document to its sparse bag-of-words vector.
      corpus = [dictionary.doc2bow(tokens) for tokens in documents]
      lda = LdaModel(
          corpus=corpus,
          id2word=dictionary,
          num_topics=NUM_TOPICS,
          random_state=123,
          iterations=50,
      )
      return lda, corpus, dictionary


  def main():
      """Run the whole pipeline: read, segment, word cloud, LDA, report."""
      lines = read_lines(INPUT_PATH)
      for line in lines:
          print(line)

      # One document per line: LDA learns topics from word co-occurrence
      # across documents, so a single merged document gives it nothing to
      # work with. Tokenizing per line also keeps words at line boundaries
      # from being fused together.
      documents = [tokens for tokens in map(tokenize, lines) if tokens]
      if not documents:
          raise SystemExit(f"{INPUT_PATH} contains no usable text")

      all_tokens = [token for tokens in documents for token in tokens]
      save_word_cloud(all_tokens, WORDCLOUD_PATH)

      lda, corpus, dictionary = train_lda(documents)

      # U_Mass coherence of the trained model.
      coherence_model = CoherenceModel(
          model=lda, corpus=corpus, dictionary=dictionary, coherence="u_mass"
      )
      print("u_mass coherence:", coherence_model.get_coherence())

      # Print every topic with its 15 highest-weighted words.
      for topic in lda.print_topics(num_words=15):
          print(topic)

      # Optional: uncomment to save an interactive pyLDAvis page as local HTML.
      # plot = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
      # pyLDAvis.save_html(plot, "./XXXXX.html")


  if __name__ == "__main__":
      main()

        直接运行会保存词云图片并打印各主题的关键词;取消代码末尾 pyLDAvis 相关两行的注释后再运行,还可以得到一个html文件,如下所示。

        大家可以根据自己不同的需求进行自定义修改,模型主体是不变的。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/358834
推荐阅读
相关标签
  

闽ICP备14008679号