Complete at least one data-scraping case (for example: scrape the Government Work Report and generate a word cloud). The example below fetches the report text from a Sina news page, saves it to a text file, prints the 35 most frequent words, and renders a word cloud.
```python
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import jieba
import requests


# Fetch the body text of the page: concatenate the text of every <p> tag
def getText(url):
    html = requests.get(url).content
    bs = BeautifulSoup(html, features="html.parser")
    content = bs.find_all('p')
    text = ''
    for p in content:
        text += p.get_text()
        text += '\n'
    return text


# Word-frequency analysis: print the 35 most frequent words of length >= 2
def getWordFrequency(text):
    from collections import Counter
    words = [word for word in jieba.cut(text, cut_all=True) if len(word) >= 2]
    c = Counter(words)
    for word_freq in c.most_common(35):
        word, freq = word_freq
        print(word, freq)


if __name__ == '__main__':
    url = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
    text = getText(url)

    # Write the scraped text to a .txt file
    with open("政府工作报告.txt", "w", encoding="utf-8") as f:
        f.write(text)

    getWordFrequency(text)

    # Word-cloud analysis
    words = jieba.lcut(text, cut_all=True)
    # Words to ignore (stopwords)
    exclude_words = ["我们", "提高", "国家", "的", "要", "和", "为", "是", "以",
                     "随着", "对于", "对", "等", "能", "都", "中", "在", "了", "通常",
                     "如果", "我", "我国", "他", "就", "着", "什么", "将", "没有",
                     "到", "这", "也", "不", "与", "让", "更", "把"]
    # Filter with a list comprehension; calling words.remove() while iterating
    # over the same list would skip elements
    words = [word for word in words if word not in exclude_words]
    cuted = ' '.join(words)

    # Generate the word cloud (font_path must point to a font with Chinese glyphs)
    wc = WordCloud(font_path='msyh.ttc', width=1000, height=700,
                   background_color='white').generate(cuted)
    wc.to_file("wordcloud.png")
```
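
As an alternative to filtering the tokens by hand, WordCloud can drop stopwords itself through its `stopwords` parameter. The snippet below is only a sketch, not part of the original assignment: it reuses the `text` and `exclude_words` variables from the example above, and the output filename `wordcloud_stopwords.png` is an assumption.

```python
# Sketch: same word cloud, but letting WordCloud apply the stopword list.
# Assumes `text` and `exclude_words` from the example above are in scope.
import jieba
from wordcloud import WordCloud

cuted = ' '.join(jieba.lcut(text, cut_all=True))
wc = WordCloud(font_path='msyh.ttc', width=1000, height=700,
               background_color='white',
               stopwords=set(exclude_words)).generate(cuted)
wc.to_file("wordcloud_stopwords.png")  # assumed output name
```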

The screenshot of the output is lost, so it is not included here. A second case follows: scraping the province-level COVID-19 figures from the DXY (丁香园) epidemic page and drawing a word cloud weighted by confirmed counts.
```python
import requests
from pyquery import PyQuery as pq
import json
from xlsxwriter import Workbook   # not used here; see the Excel-export sketch below
from wordcloud import WordCloud
import matplotlib.pyplot as plt

url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
response = requests.get(url)  # request the page
if response.status_code == 200:
    response.encoding = "utf-8"
    dom = pq(response.content)
    # The per-province data is embedded in a <script id="getAreaStat"> tag;
    # cut out the JSON literal and parse it
    jsonobj = json.loads(dom("script#getAreaStat").text().split(" = ")[1].split("}catch")[0])

    province_data = {}  # province name -> confirmed count
    for item in jsonobj:
        provinceName = item["provinceName"]
        confirmedCount = item["confirmedCount"]
        province_data[provinceName] = confirmedCount

    # Write the scraped data to a text file
    with open('全国各省疫情数据.txt', 'w', encoding="utf-8") as f:
        for k, v in province_data.items():
            f.write(f"{k}:{v}\n")

    # Generate the word cloud from the frequency dict
    # (raw string for the Windows font path so backslashes are not treated as escapes)
    wc = WordCloud(background_color='white', font_path=r'C:\Windows\Fonts\simhei.ttf',
                   width=2000, height=1500)
    wc.generate_from_frequencies(frequencies=province_data)
    # Display the result
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
```
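
The example imports `Workbook` from xlsxwriter but never uses it, so the province data was presumably also meant to be exported to Excel. Below is a minimal sketch of that step under that assumption; the filename, sheet name, and column layout are mine, not from the original.

```python
# Sketch: write the scraped province_data dict to an .xlsx file with xlsxwriter.
# Filename, sheet name and column layout are assumptions.
from xlsxwriter import Workbook

workbook = Workbook("全国各省疫情数据.xlsx")
worksheet = workbook.add_worksheet("confirmed")
worksheet.write(0, 0, "province")        # header row
worksheet.write(0, 1, "confirmedCount")
for row, (name, count) in enumerate(province_data.items(), start=1):
    worksheet.write(row, 0, name)
    worksheet.write(row, 1, count)
workbook.close()                         # flush and save the workbook
```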
