Complete at least one data-scraping case (for example: scrape the Government Work Report and generate a word cloud). The example below fetches the report text from a Sina news page, saves it to a text file, prints the 35 most frequent words, and renders a word cloud.
```python
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import jieba
import requests


# Fetch the body text of the page: concatenate the text of every <p> tag
def getText(url):
    html = requests.get(url).content
    bs = BeautifulSoup(html, features="html.parser")
    content = bs.find_all('p')
    text = ''
    for p in content:
        text += p.get_text()
        text += '\n'
    return text


# Word-frequency analysis: print the 35 most frequent words of length >= 2
def getWordFrequency(text):
    from collections import Counter
    words = [word for word in jieba.cut(text, cut_all=True) if len(word) >= 2]
    c = Counter(words)
    for word_freq in c.most_common(35):
        word, freq = word_freq
        print(word, freq)


if __name__ == '__main__':
    url = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
    text = getText(url)

    # Write the scraped text to a .txt file
    with open("政府工作报告.txt", "w", encoding="utf-8") as f:
        f.write(text)

    getWordFrequency(text)

    # Word-cloud analysis
    words = jieba.lcut(text, cut_all=True)
    # Words to ignore (stopwords)
    exclude_words = ["我们", "提高", "国家", "的", "要", "和", "为", "是", "以",
                     "随着", "对于", "对", "等", "能", "都", "中", "在", "了", "通常",
                     "如果", "我", "我国", "他", "就", "着", "什么", "将", "没有",
                     "到", "这", "也", "不", "与", "让", "更", "把"]
    # Filter with a list comprehension; calling words.remove() while iterating
    # over the same list would skip elements
    words = [word for word in words if word not in exclude_words]
    cuted = ' '.join(words)

    # Generate the word cloud (font_path must point to a font with Chinese glyphs)
    wc = WordCloud(font_path='msyh.ttc', width=1000, height=700,
                   background_color='white').generate(cuted)
    wc.to_file("wordcloud.png")
```
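
As an alternative to filtering the tokens by hand, WordCloud can drop stopwords itself through its `stopwords` parameter. The snippet below is only a sketch, not part of the original assignment: it reuses the `text` and `exclude_words` variables from the example above, and the output filename `wordcloud_stopwords.png` is an assumption.

```python
# Sketch: same word cloud, but letting WordCloud apply the stopword list.
# Assumes `text` and `exclude_words` from the example above are in scope.
import jieba
from wordcloud import WordCloud

cuted = ' '.join(jieba.lcut(text, cut_all=True))
wc = WordCloud(font_path='msyh.ttc', width=1000, height=700,
               background_color='white',
               stopwords=set(exclude_words)).generate(cuted)
wc.to_file("wordcloud_stopwords.png")  # assumed output name
```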

The screenshot of the output is lost, so it is not included here. A second case follows: scraping the province-level COVID-19 figures from the DXY (丁香园) epidemic page and drawing a word cloud weighted by confirmed counts.
```python
import requests
from pyquery import PyQuery as pq
import json
from xlsxwriter import Workbook   # not used here; see the Excel-export sketch below
from wordcloud import WordCloud
import matplotlib.pyplot as plt

url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
response = requests.get(url)  # request the page
if response.status_code == 200:
    response.encoding = "utf-8"
    dom = pq(response.content)
    # The per-province data is embedded in a <script id="getAreaStat"> tag;
    # cut out the JSON literal and parse it
    jsonobj = json.loads(dom("script#getAreaStat").text().split(" = ")[1].split("}catch")[0])

    province_data = {}  # province name -> confirmed count
    for item in jsonobj:
        provinceName = item["provinceName"]
        confirmedCount = item["confirmedCount"]
        province_data[provinceName] = confirmedCount

    # Write the scraped data to a text file
    with open('全国各省疫情数据.txt', 'w', encoding="utf-8") as f:
        for k, v in province_data.items():
            f.write(f"{k}:{v}\n")

    # Generate the word cloud from the frequency dict
    # (raw string for the Windows font path so backslashes are not treated as escapes)
    wc = WordCloud(background_color='white', font_path=r'C:\Windows\Fonts\simhei.ttf',
                   width=2000, height=1500)
    wc.generate_from_frequencies(frequencies=province_data)
    # Display the result
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
```
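
The example imports `Workbook` from xlsxwriter but never uses it, so the province data was presumably also meant to be exported to Excel. Below is a minimal sketch of that step under that assumption; the filename, sheet name, and column layout are mine, not from the original.

```python
# Sketch: write the scraped province_data dict to an .xlsx file with xlsxwriter.
# Filename, sheet name and column layout are assumptions.
from xlsxwriter import Workbook

workbook = Workbook("全国各省疫情数据.xlsx")
worksheet = workbook.add_worksheet("confirmed")
worksheet.write(0, 0, "province")        # header row
worksheet.write(0, 1, "confirmedCount")
for row, (name, count) in enumerate(province_data.items(), start=1):
    worksheet.write(row, 0, name)
    worksheet.write(row, 1, count)
workbook.close()                         # flush and save the workbook
```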
