当前位置:   article > 正文

Python课设【机械学院】_机器学习课程设计报告

机器学习课程设计报告

1.题目要求

        完成至少一个数据爬取的案例(例如:爬取政府工作报告,并生成词云)

2.实现

  • 爬政府工作报告
  1. from wordcloud import WordCloud
  2. from bs4 import BeautifulSoup
  3. import jieba
  4. import requests
  5. # 获取网页中的正文文本
  6. def getText(url):
  7. html = requests.get(url).content
  8. bs = BeautifulSoup(html, features="html.parser")
  9. content = bs.find_all('p')
  10. text = ''
  11. for p in content:
  12. text += p.get_text()
  13. text += '\n'
  14. return text
  15. # 词频分析
  16. def getWordFrequency(text):
  17. from collections import Counter
  18. words = [word for word in jieba.cut(text, cut_all=True) if len(word) >= 2]
  19. c = Counter(words)
  20. for word_freq in c.most_common(35):
  21. word, freq = word_freq
  22. print(word, freq)
  23. if __name__ == '__main__':
  24. url = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
  25. text = getText(url)
  26. # 将爬取到的内容写进txt文件中
  27. f = open(u"政府工作报告.txt", "w", encoding="utf-8")
  28. f.write(text)
  29. f.close()
  30. getWordFrequency(text)
  31. # 词云分析
  32. words = jieba.lcut(text, cut_all=True)
  33. # 忽略的词
  34. exclude_words = ["我们", "提高", "国家", "的", "要", "和", "为", "是", "以",
  35. "随着", "对于", "对", "等", "能", "都", "中", "在", "了", "通常",
  36. "如果", "我", "我国", "他", "就", "着", "什么", "将", "没有",
  37. "到", "这", "也", "不", "与", "让", "更", "把"]
  38. for word in words:
  39. if word in exclude_words:
  40. words.remove(word)
  41. cuted = ' '.join(words)
  42. # 生成词云
  43. wc = WordCloud(font_path='msyh.ttc', width=1000, height=700, background_color='white').generate(cuted)
  44. wc.to_file("wordcloud.png")

        运行结果图没了,懒得放了

  • 爬疫情数据
  1. import requests
  2. from pyquery import PyQuery as pq
  3. import json
  4. from xlsxwriter import Workbook
  5. from wordcloud import WordCloud
  6. import matplotlib.pyplot as plt
  7. url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
  8. response = requests.get(url) # 请求
  9. if response.status_code == 200:
  10. response.encoding = "utf-8"
  11. dom = pq(response.content)
  12. # 取出各省的数据,转为json对象
  13. jsonobj = json.loads(dom("script#getAreaStat").text().split(" = ")[1].split("}catch")[0])
  14. province_data = {} # 用来存储数据
  15. # 遍历
  16. for item in jsonobj:
  17. provinceName = item["provinceName"]
  18. confirmedCount = item["confirmedCount"]
  19. province_data[provinceName] = confirmedCount
  20. # 将爬取到的内容写进文件中
  21. with open(u'全国各省疫情数据.txt', 'w', encoding="utf-8") as f:
  22. for k, v in province_data.items():
  23. f.write(f"{k}:{v}\n")
  24. f.close()
  25. # 生成词云图
  26. wc = WordCloud(background_color='white', font_path='C:\Windows\Fonts\simhei.ttf', width=2000, height=1500)
  27. wc.generate_from_frequencies(frequencies=province_data)
  28. # 画图
  29. plt.figure()
  30. plt.imshow(wc, interpolation='bilinear')
  31. plt.axis('off')
  32. plt.show()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/在线问答5/article/detail/827942
推荐阅读
相关标签
  

闽ICP备14008679号