当前位置:   article > 正文

Python爬虫 | 爬取微博和哔哩哔哩数据_哔哩哔哩爬虫

哔哩哔哩爬虫

目录

一、bill_comment.py

二、bili_comment_pic.py

三、bilibili.py

四、bilihot_pic.py

五、bilisearch_pic.py

六、draw_cloud.py

七、weibo.py

八、weibo_comment.py

九、weibo_comment_pic.py

十、weibo_pic.py

十一、weibo_top.py

十二、weibo_top_pic.py

十三、weibo_top_pie.py

十四、pachong.py

十五、代码文件说明


一、bill_comment.py

  1. import requests# 发送请求
  2. import pandas as pd#保存csv文件
  3. import os # 判断文件是否存在
  4. import time
  5. from time import sleep# 设置等待,防止反爬
  6. import json
  7. import random# 生成随机数
  8. import os.path
  9. import requests
  10. import csv
  11. import re
  12. import bili_comment_pic
  def trans_date(v_timestamp):
      """Convert a Unix timestamp in seconds to a local-time string.

      Args:
          v_timestamp: Seconds since the epoch (the 10-digit ``ctime`` value
              of a reply).

      Returns:
          The local time formatted as ``YYYY-MM-DD HH:MM:SS``.
      """
      # The pattern used to be "%H: %M:%S"; the stray space after the hour
      # produced values such as "12: 30:05".
      return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(v_timestamp))
  def getoid(bv, headers=None, timeout=10):
      """Look up the ``aid`` of a Bilibili video from its page HTML.

      The value is what the reply API expects as its ``oid`` parameter.

      Args:
          bv: The video's BV id, e.g. ``"BV1Ss4y1M7KT"``.
          headers: Optional request headers forwarded to ``requests.get``.
          timeout: Seconds to wait for the page before giving up.

      Returns:
          The text captured as ``aid`` (a string).

      Raises:
          requests.HTTPError: If the video page answers with an error status.
          ValueError: If no ``aid`` is found next to this BV id in the page.
      """
      resp = requests.get("https://www.bilibili.com/video/" + bv,
                          headers=headers, timeout=timeout)
      resp.raise_for_status()
      # The id sits right in front of the matching bvid in the page source;
      # escape the BV id so it is always matched literally.
      pattern = re.compile(r'"aid":(?P<id>.*?),"bvid":"' + re.escape(bv) + '"')
      match = pattern.search(resp.text)
      if match is None:
          raise ValueError('aid not found in video page for {}'.format(bv))
      oid = match.group('id')
      # Tell the user that the oid parameter has been resolved.
      print('oid是'+oid)
      return oid
  def get_bili_comment(bv_list,max_page):
      """Download the top-level comments of Bilibili videos into CSV files.

      For every BV id, pages 1..max_page of the reply API are requested. The
      collected rows are de-duplicated by comment text, written to
      ``biliComment_<max_page>pages_<bvid>.csv`` and the file is passed to
      ``bili_comment_pic.main`` for plotting.

      Args:
          bv_list: Iterable of BV ids.
          max_page: Highest page number to request for each video.
      """
      columns = ['评论页码', '评论时间', '评论作者', '评论id', '点赞数', '评论回复数', '评论内容']
      # A minimal header set is enough for this endpoint.
      headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
          "referer": "https://www.bilibili.com/"
      }
      url = "https://api.bilibili.com/x/v2/reply/main"

      for bvid in bv_list:
          # Output file; a stale file from an earlier run is removed first.
          bili_file = 'biliComment_{}pages_{}.csv'.format(max_page, bvid)
          if os.path.exists(bili_file):
              os.remove(bili_file)
              print('存在,已删除:{}'.format(bili_file))

          # Resolve the oid once per video instead of once per page.
          oid = getoid(bvid)
          rows = []
          for page in range(1, max_page + 1):
              params = {
                  'jsonp': 'jsonp',
                  # Per the original author's note: mode=3 sorts by
                  # popularity, mode=2 sorts by time.
                  'mode': '3',
                  'oid': oid,      # which video
                  'next': page,    # which page of comments
                  'type': '1',     # comment type, fixed to 1 here
              }
              response = requests.get(url, headers=headers, params=params, timeout=10)
              print(response.status_code)
              # "data" or "replies" can be missing/null when there is nothing
              # (more) to return; treat that as the end of the comments.
              data_list = (response.json().get('data') or {}).get('replies') or []
              if not data_list:
                  break
              for reply in data_list:
                  rows.append({
                      '评论页码': page,
                      '评论时间': trans_date(reply['ctime']),
                      '评论作者': reply['member']['uname'],
                      # This column stores the commenter's member id.
                      '评论id': reply['member']['mid'],
                      '点赞数': reply['like'],
                      '评论回复数': reply['rcount'],
                      '评论内容': reply['content']['message'],
                  })
              print('第{}页爬取完成'.format(page))

          if not rows:
              # Nothing was collected, so there is nothing to clean or plot.
              print('未获取到评论:{}'.format(bvid))
              continue

          # Clean up: drop comments with identical text, keep the first one.
          df = pd.DataFrame(rows, columns=columns)
          df.drop_duplicates(subset='评论内容', inplace=True, keep='first')
          df.to_csv(bili_file, index=False, columns=columns, encoding='utf-8-sig')
          print('数据清洗完成')
          bili_comment_pic.main(bili_file)
  if __name__=='__main__':
      # BV ids of the videos to scrape; several ids may be given.
      raw_ids = input("请输入视频bv号(示例:BV1Ss4y1M7KT,BV1VM411N7qc),以逗号分隔:")
      # Accept ASCII and full-width commas, ignore surrounding blanks and
      # empty entries (e.g. a trailing comma).
      bv_list = [part.strip() for part in raw_ids.replace(',', ',').split(',') if part.strip()]
      # Highest comment page to fetch.
      max_page = int(input("请输入搜索的页数"))
      # Run the scraper.
      get_bili_comment(bv_list=bv_list, max_page=max_page)

二、bili_comment_pic.py

  1. # 允许副本存在,忽略报错
  2. import os
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from matplotlib import font_manager
  6. import numpy as np
  7. os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
  def view(info,bili_file):
      """Plot likes as bars with the reply count as a line, then save and show.

      Args:
          info: DataFrame with the columns '点赞数', '评论回复数' and
              '评论内容'. The reply line is plotted against the row index,
              so a 0..n-1 index lines it up with the bars.
          bili_file: Name embedded in the output image file name.
      """
      # Font file used for the Chinese chart title.
      my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
      likes = info['点赞数']
      reply = info['评论回复数']
      comment = info['评论内容']
      # Let axis texts render Chinese characters and the minus sign.
      plt.rcParams['font.sans-serif'] = ['SimHei']
      plt.rcParams['axes.unicode_minus'] = False

      # Bar chart: likes per comment.
      fig, ax1 = plt.subplots()
      bars = ax1.bar(x=np.arange(len(comment)), tick_label=comment, height=likes,
                     color='red', label='点赞数')
      plt.title('点赞数和评论数数据分析', fontproperties=my_font)
      ax1.tick_params(labelsize=6)
      plt.xlabel('评论内容')
      plt.ylabel('点赞数')
      plt.xticks(rotation=90, color='green')

      # Line chart on a second y axis: replies per comment.
      ax2 = ax1.twinx()
      ax2.plot(reply, color='cyan')
      plt.ylabel('评论数')
      # Use the bars themselves as the legend handle instead of plotting a
      # dummy line just to obtain a legend entry.
      plt.legend(handles=[bars])

      # Build the path portably and make sure the folder exists; savefig
      # does not create missing directories.
      out_dir = os.path.join('.', '图片')
      os.makedirs(out_dir, exist_ok=True)
      plt.savefig(os.path.join(out_dir, 'pic-{}.png'.format(bili_file)), dpi=1000, bbox_inches='tight')
      plt.show()
  def main(bili_file, top_n=60):
      """Plot the most-liked comments stored in a comment CSV file.

      Args:
          bili_file: Path of the CSV file to read.
          top_n: How many of the most-liked rows to plot (default 60).
      """
      info = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
      # Re-number the rows so the line plot in view() lines up with the bars.
      info = info.nlargest(top_n, '点赞数').reset_index(drop=True)
      view(info, bili_file)


  if __name__ == '__main__':
      main('biliComment_15pages_BV1Ss4y1M7KT.csv')

三、bilibili.py

  1. import requests
  2. from urllib.parse import quote
  3. import json
  4. import time
  5. from time import sleep
  6. import pandas as pd
  7. import hashlib
  8. import bilihot_pic
  9. import bilisearch_pic
  10. """
  11. bilisearch类的需求功能
  12. 1.初始化需要输入参数
  13. search:你需要搜索的数据
  14. page:需要查看的页数
  15. 2.使用方法
  16. a = bilisearch(search,page) 初始化类
  17. a.findall() 将爬取的数据存入csv文件中
  18. """
  class bilisearch():
      """Scrape Bilibili keyword-search results into one CSV file per page.

      Create it with the keyword and the number of pages, then call
      ``findall()``::

          a = bilisearch(search, page)
          a.findall()

      Each page is written to ``b站清单_<keyword>_第<n>页.csv`` and the file is
      handed to ``bilisearch_pic.main`` for plotting.
      """

      _SEARCH_URL = 'https://api.bilibili.com/x/web-interface/wbi/search/all/v2'
      # Key appended to the query string before it is hashed into ``w_rid``.
      # NOTE(review): presumably derived from Bilibili's rotating WBI keys,
      # so it may need refreshing when requests start failing - confirm.
      _SIGN_KEY = '55540207d820a7368ab7e104169d409d'
      # At most this many entries of a result page are kept.
      _ROWS_PER_PAGE = 30

      def __init__(self, search, page, cookie=None):
          """Store the search settings and build the request headers.

          Args:
              search: Keyword to search for.
              page: Number of result pages to fetch.
              cookie: Optional Cookie header value. When omitted, the
                  ``BILIBILI_COOKIE`` environment variable is used if set.
                  A personal session cookie used to be hard-coded here;
                  credentials do not belong in source code.
          """
          import os

          self.search = search
          # URL-encoded keyword fragment (kept for callers that read it).
          self.searchurl = '&keyword=' + quote(search, 'utf-8')
          # Header names are case-insensitive, so only one referer is kept
          # (the lower-case entry is the one that took effect before).
          self.head = {
              'authority': 'api.bilibili.com',
              'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44',
              'referer': 'https://www.bilibili.com/',
          }
          cookie = cookie or os.environ.get('BILIBILI_COOKIE')
          if cookie:
              self.head['cookie'] = cookie
          # Number of pages to fetch.
          self.page = page

      def dataProcess(self, data):
          """Turn one page of raw search entries into CSV rows.

          Args:
              data: List of result dicts as returned by the search API.

          Returns:
              A list of ``[author, title, play, description, pic, arcurl,
              id, pubdate]`` rows; entries of type ``picture_ad_0`` are
              skipped.
          """
          storedata = []
          # Slice instead of indexing 0..29 so short pages do not raise.
          for item in data[:self._ROWS_PER_PAGE]:
              if item['type'] == 'picture_ad_0':
                  continue
              # Strip the keyword highlighting markup from the title.
              title = item['title'].replace('<em class="keyword">', '').replace('</em>', '')
              # Seconds-since-epoch timestamp -> local time string.
              pubdate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item['pubdate']))
              storedata.append([item['author'], title, item['play'], item['description'],
                                item['pic'], item['arcurl'], item['id'], pubdate])
          return storedata

      def _query(self, page, timenow):
          """Return the key-sorted, URL-encoded query string for *page*."""
          from urllib.parse import urlencode

          if page == 1:
              params = {
                  '__refresh__': 'true',
                  '_extra': '',
                  'context': '',
                  'page': page,
                  'page_size': '42',
                  'order': '',
                  'duration': '',
                  'from_source': '',
                  'from_spmid': '333.337',
                  'platform': 'pc',
                  'highlight': '1',
                  'single_column': '0',
                  'keyword': self.search,
                  'qv_id': 'noyCOTfEBm8ZzMVGopKgzYbiqLFxoAn1',
                  'ad_resource': '5646',
                  'source_tag': '3',
                  'web_location': '1430654',
              }
          else:
              params = {
                  'refresh': 'true',
                  '_extra': '',
                  'ad_resource': '5654',
                  'category_id': '',
                  'context': '',
                  'dynamic_offset': (page - 1) * 30,
                  'from_source': '',
                  'from_spmid': '333.337',
                  'gaia_vtoken': '',
                  'highlight': '1',
                  'keyword': self.search,
                  'page': page,
                  'page_size': '42',
                  'platform': 'pc',
                  'qv_id': 'hJgZIEUY51fw9Pp7s8pidIVEJ7Z08KaS',
                  'search_type': 'video',
                  'single_column': '0',
                  'source_tag': '3',
                  'web_location': '1430654',
              }
          params['wts'] = timenow
          return urlencode(sorted(params.items()))

      def reverse(self, page):
          """Compute the request signature for *page*.

          The signature is the MD5 of the exact query string that
          ``findall`` sends, followed by the signing key. Previously it was
          hashed from a hand-written string that differed from the request
          (other ``qv_id``, unencoded keyword).

          Returns:
              ``(w_rid, wts)``: the hex digest and the timestamp it covers.
          """
          timenow = int(time.time())
          signed = self._query(page, timenow) + self._SIGN_KEY
          return hashlib.md5(signed.encode('UTF-8')).hexdigest(), timenow

      def _video_results(self, data):
          """Pick the list of video entries out of a decoded API response."""
          results = data['data']['result']
          # NOTE(review): result groups are assumed to carry a "result_type"
          # field - confirm. If none is tagged "video", fall back to the
          # fixed position the code used before.
          for group in results:
              if isinstance(group, dict) and group.get('result_type') == 'video':
                  return group.get('data') or []
          return results[10]['data']

      # Default ("comprehensive") ordering.
      def findall(self):
          """Fetch every requested page and store each one as a CSV file."""
          for pnum in range(1, int(self.page) + 1):
              w_rid, timenow = self.reverse(pnum)
              url = '{}?{}&w_rid={}'.format(self._SEARCH_URL, self._query(pnum, timenow), w_rid)
              target = requests.get(url, headers=self.head, timeout=10)
              # Decode the JSON body into Python objects.
              data = json.loads(target.text)
              storedata = self.dataProcess(self._video_results(data))
              print('第', pnum, '页完成')
              self.storeCsvdata('b站清单_' + str(self.search) + '_第' + str(pnum) + '页.csv', storedata, pnum)
              # Wait one second between pages.
              sleep(1)

      def storeCsvdata(self, filename, storedata, pagenum):
          """Write *storedata* to *filename* as CSV and plot it.

          Args:
              filename: Target CSV path.
              storedata: Rows as produced by ``dataProcess``.
              pagenum: Page number (unused; kept for existing callers).
          """
          name = ['作者', '标题', '播放量', '简介', '封面', '播放地址', 'id', '时间']
          # pandas opens and closes the file itself; the extra open() handle
          # that used to wrap this call was never written to.
          pd.DataFrame(storedata, columns=name).to_csv(filename, index=False, encoding='utf-8-sig')
          bilisearch_pic.main(filename)
  114. """
  115. bilihot类的功能
  116. 1.初始化需要的参数
  117. 2.使用方法
  118. a = bilihot() 初始化
  119. a.findall() 调用搜索
  120. a.storeCsvdata() 储存数据
  121. a.data 可以查看数据
  122. a.data[i][j] i为第几个数据集合 j为['作者','标题','播放量','简介','封面','id','播放地址','时间','分区']
  123. """
  class bilihot():
      """Scrape the Bilibili site-wide ranking and store it as CSV.

      Usage::

          a = bilihot()
          a.findall()        # fetch the ranking into a.data
          a.storeCsvdata()   # write b站排行榜.csv and plot it

      ``a.data[i]`` is one row in the order 作者, 标题, 播放量, 简介, 封面,
      id, 播放地址, 时间, 分区.
      """

      _RANKING_URL = 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all'

      def __init__(self):
          """Build the request headers and start with an empty result list."""
          # Header names are case-insensitive, so only one referer is kept
          # (the lower-case entry is the one that took effect before).
          self.head = {
              'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44',
              'referer': 'https://www.bilibili.com/v/popular/rank/all',
              'authority': 'api.bilibili.com',
          }
          # Rows collected by the latest findall() call.
          self.data = []

      def findall(self):
          """Request the ranking and store one row per video in ``self.data``."""
          target = requests.get(self._RANKING_URL, headers=self.head, timeout=10)
          # Decode the JSON body into Python objects.
          data = json.loads(target.text)
          rows = []
          for item in data['data']['list']:
              # Seconds-since-epoch timestamp -> local time string.
              pubdate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item['pubdate']))
              rows.append([
                  item['owner']['name'],   # author
                  item['title'],           # title
                  item['stat']['view'],    # play count
                  item['desc'],            # description
                  item['pic'],             # cover image
                  item['aid'],             # id
                  item['short_link_v2'],   # play URL
                  pubdate,                 # publication time
                  item['tname'],           # category
              ])
          # Replace rather than extend so a second call does not duplicate rows.
          self.data = rows
          print('请求数据成功')

      def storeCsvdata(self):
          """Write ``self.data`` to ``b站排行榜.csv`` and plot the file."""
          name = ['作者', '标题', '播放量', '简介', '封面', 'id', '播放地址', '时间', '分区']
          # pandas opens and closes the file itself; the extra open() handle
          # that used to wrap this call was never written to.
          pd.DataFrame(self.data, columns=name).to_csv('b站排行榜.csv', index=False, encoding='utf-8-sig')
          print('写入成功')
          bilihot_pic.main('b站排行榜.csv')
  if __name__ == '__main__':
      # Keyword to search for.
      search = input("请输入搜索的关键词")
      # Number of result pages to fetch.
      page = int(input("请输入搜索的页数"))
      # Scrape the search results; each page is saved to its own CSV file.
      a = bilisearch(search, page)
      a.findall()
      # Scrape the site-wide ranking ...
      b = bilihot()
      b.findall()
      # ... and store it as CSV.
      b.storeCsvdata()

四、bilihot_pic.py

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. from matplotlib import font_manager
  4. import numpy as np
  def view(info,bili_file):
      """Draw a horizontal bar chart of play counts per title, save and show.

      Args:
          info: DataFrame with the columns '标题' and '播放量'.
          bili_file: Name embedded in the output image file name.
      """
      import os

      # Font file used for the Chinese chart title.
      my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
      # Let axis texts render Chinese characters and the minus sign.
      plt.rcParams['font.sans-serif'] = ['SimHei']
      plt.rcParams['axes.unicode_minus'] = False
      title = info['标题']
      views = info['播放量']

      # Bar chart: play count per title.
      fig, ax1 = plt.subplots()
      ax1.barh(y=np.arange(len(title)), tick_label=title, width=views,
               color='cyan', label='播放量')
      plt.title('标题和播放量的数据分析', fontproperties=my_font)
      ax1.tick_params(labelsize=6)
      plt.xlabel('播放量')
      plt.ylabel('标题')
      plt.yticks(color='green')  # colour of the y tick labels
      # The labelled bars provide the legend entry; no dummy line is needed.
      plt.legend()

      # Build the path portably and make sure the folder exists; savefig
      # does not create missing directories.
      out_dir = os.path.join('.', '图片')
      os.makedirs(out_dir, exist_ok=True)
      plt.savefig(os.path.join(out_dir, 'pic-{}.png'.format(bili_file)), dpi=1000, bbox_inches='tight')
      plt.show()
  def main(bili_file, top_n=50):
      """Plot the most-played entries of a ranking CSV file.

      Args:
          bili_file: Path of the CSV file to read.
          top_n: How many of the most-played rows to plot (default 50).
      """
      info = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
      # Sort the selected rows by ascending play count before plotting.
      info = info.nlargest(top_n, '播放量').sort_values('播放量', ascending=True)
      view(info, bili_file)


  if __name__ == '__main__':
      main('b站排行榜.csv')

五、bilisearch_pic.py

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. from matplotlib import font_manager
  4. import numpy as np
  def view(info,bili_file):
      """Draw a horizontal bar chart of play counts per title, save and show.

      Args:
          info: DataFrame with the columns '标题' and '播放量'.
          bili_file: Name embedded in the output image file name.
      """
      import os

      # Font file used for the Chinese chart title.
      my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
      # Let axis texts render Chinese characters and the minus sign.
      plt.rcParams['font.sans-serif'] = ['SimHei']
      plt.rcParams['axes.unicode_minus'] = False
      title = info['标题']
      views = info['播放量']

      # Bar chart: play count per title.
      fig, ax1 = plt.subplots()
      ax1.barh(y=np.arange(len(title)), tick_label=title, width=views,
               color='green', label='播放量')
      plt.title('标题和播放量的数据分析', fontproperties=my_font)
      ax1.tick_params(labelsize=6)
      plt.xlabel('播放量')
      plt.ylabel('标题')
      plt.yticks(color='blue')  # colour of the y tick labels
      # The labelled bars provide the legend entry; no dummy line is needed.
      plt.legend()

      # Build the path portably and make sure the folder exists; savefig
      # does not create missing directories.
      out_dir = os.path.join('.', '图片')
      os.makedirs(out_dir, exist_ok=True)
      plt.savefig(os.path.join(out_dir, 'pic-{}.png'.format(bili_file)), dpi=1000, bbox_inches='tight')
      plt.show()
  def main(bili_file):
      """Plot the play counts stored in a search-result CSV file.

      Args:
          bili_file: Path of the CSV file to read.
      """
      info = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
      if info.empty:
          # A result page without videos yields a header-only CSV; there is
          # nothing to draw in that case.
          print('没有可绘制的数据:{}'.format(bili_file))
          return
      info = info.sort_values('播放量', ascending=True)
      view(info, bili_file)


  if __name__ == '__main__':
      main('b站清单_疫情_第1页.csv')

六、draw_cloud.py

  1. import numpy as np
  2. import pandas as pd
  3. from wordcloud import WordCloud, ImageColorGenerator
  4. from PIL import Image
  def draw_cloud(weibo_file):
      """Render a word cloud from a hot-search CSV file and save it as PNG.

      Args:
          weibo_file: CSV path with the columns '热搜内容' (word) and
              '热搜热度' (weight). The name is embedded in the output file
              name.
      """
      import os

      # The background picture serves as the mask (outline) of the cloud.
      image = Image.open(os.path.join('.', 'background.jpg'))
      graph = np.array(image)
      # Font, background colour, maximum number of words, mask shape.
      wc = WordCloud(font_path='msyh.ttc',background_color='white',max_words=100, mask=graph)
      table = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')
      # Word -> weight mapping; words are forced to str.
      dic = dict(zip((str(word) for word in table['热搜内容']), table['热搜热度']))
      print(dic)
      wc.generate_from_frequencies(dic)
      # The colour generator built here before was never applied to the
      # cloud, so it has been dropped.
      # Build the path portably and make sure the folder exists first.
      out_dir = os.path.join('.', '图片')
      os.makedirs(out_dir, exist_ok=True)
      wc.to_file(os.path.join(out_dir, 'draw_cloud-{}.png'.format(weibo_file)))


  if __name__ == '__main__':
      draw_cloud('微博top_fun.csv')

七、weibo.py

  1. import os.path
  2. import re
  3. from jsonpath import jsonpath
  4. import requests
  5. import pandas as pd
  6. import datetime
  7. from fake_useragent import UserAgent
  8. import weibo_pic
  def trans_time(v_str):
      """Convert a ``created_at`` string to ``YYYY-MM-DD HH:MM:SS``.

      Args:
          v_str: Time such as ``"Sat May 20 12:34:56 +0800 2023"``.

      Returns:
          The same wall-clock time as ``"2023-05-20 12:34:56"``.

      Raises:
          ValueError: If *v_str* does not match the expected layout.
      """
      # %z accepts any UTC offset; the previous pattern hard-coded "+0800"
      # and rejected every other offset.
      parsed = datetime.datetime.strptime(v_str, '%a %b %d %H:%M:%S %z %Y')
      return parsed.strftime("%Y-%m-%d %H:%M:%S")
  def get_weibo_list(v_keyword,v_max_page):
      """Scrape Weibo search results for a keyword into a CSV file.

      Pages 1..v_max_page are requested, the posts are de-duplicated by
      their bid, written to ``微博清单_<keyword>_前<n>页.csv`` and the file is
      passed to ``weibo_pic.main`` for plotting.

      :param v_keyword: search keyword
      :param v_max_page: number of result pages to fetch
      :return: None
      """
      columns = ['页码','微博id','微博bid','微博作者','发布时间','微博内容','转发数','评论数','点赞数']
      # Output file; a stale file from an earlier run is removed first.
      v_weibo_file = '微博清单_{}_前{}页.csv'.format(v_keyword,v_max_page)
      if os.path.exists(v_weibo_file):
          os.remove(v_weibo_file)
          print('微博清单存在,已删除:{}'.format(v_weibo_file))

      # The misspelled "accept-encording" header was dropped: under that
      # name it never acted as Accept-Encoding, so requests' own default
      # negotiation has been in effect all along.
      headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
          "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
      }
      url = 'https://m.weibo.cn/api/container/getIndex'
      # Removes HTML tags from the post text.
      tag_pattern = re.compile(r'<[^>]+>', re.S)

      rows = []
      for page in range(1, v_max_page+1):
          print('===开始爬取第{}页微博==='.format(page))
          params = {
              "containerid": "100103type=1&q={}".format(v_keyword),
              "page_type": "searchall",
              "page": page
          }
          r = requests.get(url, headers=headers, params=params, timeout=10)
          print(r.status_code)
          cards = (r.json().get("data") or {}).get("cards") or []
          # Read every field from the same mblog object so the columns
          # cannot drift apart when a post lacks one of the fields.
          mblogs = jsonpath(cards, '$..mblog')
          if not mblogs:
              # jsonpath yields False when nothing matches: next page.
              continue
          for mblog in mblogs:
              text = tag_pattern.sub('', mblog.get('text', ''))
              print(text)
              user = mblog.get('user') or {}
              rows.append({
                  '页码': page,
                  '微博id': user.get('id'),
                  '微博bid': mblog.get('bid'),
                  '微博作者': user.get('screen_name'),
                  '发布时间': trans_time(v_str=mblog['created_at']),
                  '微博内容': text,
                  '转发数': mblog.get('reposts_count'),
                  '评论数': mblog.get('comments_count'),
                  '点赞数': mblog.get('attitudes_count'),
              })

      if not rows:
          # Nothing was collected, so there is nothing to clean or plot.
          print('未获取到微博:{}'.format(v_keyword))
          return

      # Clean up: drop posts with the same bid, keep the first one.
      df = pd.DataFrame(rows, columns=columns)
      df.drop_duplicates(subset='微博bid', inplace=True, keep='first')
      df.to_csv(v_weibo_file, index=False, columns=columns, encoding='utf-8-sig')
      print('csv保存成功:{}'.format(v_weibo_file))
      print('数据清洗完成')
      weibo_pic.main(v_weibo_file)
  if __name__=='__main__':
      # Keyword to search for.
      search_keyword = input("请输入搜索的关键词")
      # Number of result pages to fetch.
      max_search_page = int(input("请输入搜索的页数"))
      # Run the scraper.
      get_weibo_list(v_keyword=search_keyword, v_max_page=max_search_page)

八、weibo_comment.py

  1. import requests# 发送请求
  2. import pandas as pd#保存csv文件
  3. import os # 判断文件是否存在
  4. import datetime
  5. import time
  6. from time import sleep# 设置等待,防止反爬
  7. import json
  8. import random# 生成随机数
  9. import os.path
  10. import requests
  11. import csv
  12. import re
  13. import weibo_comment_pic
  def trans_time(v_str):
      """Convert a ``created_at`` string to ``YYYY-MM-DD HH:MM:SS``.

      Args:
          v_str: Time such as ``"Sat May 20 12:34:56 +0800 2023"``.

      Returns:
          The same wall-clock time as ``"2023-05-20 12:34:56"``.

      Raises:
          ValueError: If *v_str* does not match the expected layout.
      """
      # %z accepts any UTC offset; the previous pattern hard-coded "+0800"
      # and rejected every other offset.
      parsed = datetime.datetime.strptime(v_str, '%a %b %d %H:%M:%S %z %Y')
      return parsed.strftime("%Y-%m-%d %H:%M:%S")
  def _parse_follower_count(raw):
      """Turn a follower count such as ``"1.5万"`` into an integer.

      Values without a 万/亿 suffix are returned as their string form.
      """
      text = str(raw)
      for suffix, factor in (('万', 10000), ('亿', 100000000)):
          if text.endswith(suffix):
              # Multiply before converting to int so "1.5万" becomes 15000
              # (it used to be truncated to 10000).
              return int(round(float(text[:-1]) * factor))
      return text


  def get_bili_comment(weiboID_list,max_page):
      """Download the comments of Weibo posts into CSV files.

      Despite its name (kept for existing callers) this function scrapes
      Weibo. For every post id up to ``max_page`` comment pages are
      requested, de-duplicated by comment text, written to
      ``weiboComment_<max_page>pages_<id>.csv`` and the file is passed to
      ``weibo_comment_pic.main`` for plotting.

      A logged-in cookie can be supplied through the ``WEIBO_COOKIE``
      environment variable; the original author noted that without a cookie
      only the first page can be fetched. The personal session cookie that
      used to be hard-coded here was removed.

      Args:
          weiboID_list: Iterable of Weibo post ids.
          max_page: Highest comment page to request per post.
      """
      columns = ['评论页码', '微博id', '评论id', '评论内容', '评论时间', '评论点赞数', '评论属地', '评论者姓名',
                 '评论者id', '评论者性别', '评论者关注数', '评论者粉丝数']
      # Removes HTML tags from the comment text; compiled once.
      tag_pattern = re.compile(r'<[^>]+>', re.S)
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
      }
      cookie = os.environ.get('WEIBO_COOKIE')
      if cookie:
          headers['cookie'] = cookie
          # The X-Xsrf-Token header mirrors the XSRF-TOKEN cookie value.
          token = re.search(r'XSRF-TOKEN=([^;]+)', cookie)
          if token:
              headers['X-Xsrf-Token'] = token.group(1)

      for weibo_id in weiboID_list:
          # Output file; a stale file from an earlier run is removed first.
          wbComment_file = 'weiboComment_{}pages_{}.csv'.format(max_page, weibo_id)
          if os.path.exists(wbComment_file):
              os.remove(wbComment_file)
              print('存在,已删除:{}'.format(wbComment_file))

          rows = []
          max_id = ''
          max_id_type = 0
          for page in range(1, max_page + 1):
              if page == 1:
                  # The first page is requested without max_id.
                  url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0'.format(weibo_id, weibo_id)
              else:
                  # max_id arrives as a JSON number, so compare its text
                  # form; comparing the number with '0' never matched.
                  if str(max_id) == '0':
                      print('max_id==0,break now')
                      break
                  url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}'.format(
                      weibo_id, weibo_id, max_id, max_id_type)
              response = requests.get(url, headers=headers, timeout=10)
              print(response.status_code)
              payload = response.json().get('data')
              if not payload:
                  # No "data" member: nothing (more) can be read.
                  print('第{}页没有数据,停止爬取'.format(page))
                  break
              max_id = payload['max_id']
              # NOTE(review): the response presumably also returns the
              # max_id_type to send with the next page - confirm. Falls
              # back to 0, the value that was always sent before.
              max_id_type = payload.get('max_id_type', 0)
              print(max_id)
              for data in payload['data']:
                  user = data['user']
                  rows.append({
                      '评论页码': page,
                      '微博id': weibo_id,
                      '评论id': data['id'],
                      '评论内容': tag_pattern.sub('', data['text']),
                      '评论时间': trans_time(data['created_at']),
                      '评论点赞数': data['like_count'],
                      '评论属地': data.get('source', ''),
                      '评论者姓名': user['screen_name'],
                      '评论者id': user['id'],
                      '评论者性别': user['gender'],
                      '评论者关注数': user['follow_count'],
                      '评论者粉丝数': _parse_follower_count(user['followers_count']),
                  })
              print('第{}页爬取完成'.format(page))

          if not rows:
              # Nothing was collected, so there is nothing to clean or plot.
              print('未获取到评论:{}'.format(weibo_id))
              continue

          # Clean up: drop comments with identical text, keep the first one.
          df = pd.DataFrame(rows, columns=columns)
          df.drop_duplicates(subset='评论内容', inplace=True, keep='first')
          df.to_csv(wbComment_file, index=False, columns=columns, encoding='utf-8-sig')
          print('数据清洗完成')
          weibo_comment_pic.main(wbComment_file)
  if __name__=='__main__':
      # Example target post: https://m.weibo.cn/detail/4903111417922777
      # Ids of the posts to scrape; several ids may be given.
      raw_ids = input("请输入微博ID(示例:4903111417922777),以逗号分隔:")
      # Accept ASCII and full-width commas, ignore surrounding blanks and
      # empty entries (e.g. a trailing comma).
      weiboID_list = [part.strip() for part in raw_ids.replace(',', ',').split(',') if part.strip()]
      # Highest comment page to fetch.
      max_page = int(input("请输入搜索的页数"))
      # Run the scraper.
      get_bili_comment(weiboID_list=weiboID_list, max_page=max_page)

九、weibo_comment_pic.py

  1. # 允许副本存在,忽略报错
  2. import os
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from matplotlib import font_manager
  6. import numpy as np
  7. os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
  def view(info,weibo_file):
      """Plot comment likes as bars with follower/following counts as lines.

      Args:
          info: DataFrame with the columns '评论点赞数', '评论者粉丝数',
              '评论者关注数' and '评论者姓名'. The lines are plotted against
              the row index, so a 0..n-1 index lines them up with the bars.
          weibo_file: Name embedded in the output image file name.
      """
      # Font file used for the Chinese chart title.
      my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
      likes = info['评论点赞数']
      followers = info['评论者粉丝数']
      following = info['评论者关注数']
      # Commenter names label the bars because the comment text is too long.
      author = info['评论者姓名']
      # Let axis texts render Chinese characters and the minus sign.
      plt.rcParams['font.sans-serif'] = ['SimHei']
      plt.rcParams['axes.unicode_minus'] = False

      # Bar chart: likes per comment.
      fig, ax1 = plt.subplots()
      bars = ax1.bar(x=np.arange(len(author)), tick_label=author, height=likes,
                     color='blue', label='评论点赞数')
      plt.title('评论点赞数、粉丝数和关注数的数据分析', fontproperties=my_font)
      ax1.tick_params(labelsize=6)
      # The ticks show commenter names, so the axis is labelled accordingly
      # (it used to read "微博内容").
      plt.xlabel('评论者姓名')
      plt.ylabel('评论点赞数')
      plt.xticks(rotation=90, color='green')

      # Line charts on a second y axis: followers and following counts.
      ax2 = ax1.twinx()
      ax2.plot(followers, color='red')
      ax2.plot(following, color='yellow')
      plt.ylabel('粉丝/关注数')
      # Use the bars themselves as the legend handle instead of plotting a
      # dummy line just to obtain a legend entry.
      plt.legend(handles=[bars])

      # Build the path portably and make sure the folder exists; savefig
      # does not create missing directories.
      out_dir = os.path.join('.', '图片')
      os.makedirs(out_dir, exist_ok=True)
      plt.savefig(os.path.join(out_dir, 'pic-{}.png'.format(weibo_file)), dpi=1000, bbox_inches='tight')
      plt.show()
  def main(weibo_file, top_n=100):
      """Plot the most-liked comments stored in a Weibo comment CSV file.

      Args:
          weibo_file: Path of the CSV file to read.
          top_n: How many of the most-liked rows to plot (default 100).
      """
      info = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')
      # Re-number the rows so the line plots in view() line up with the bars.
      info = info.nlargest(top_n, '评论点赞数').reset_index(drop=True)
      view(info, weibo_file)


  if __name__ == '__main__':
      main('weiboComment_15pages_4903111417922777.csv')

十、weibo_pic.py

  1. # 允许副本存在,忽略报错
  2. import os
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from matplotlib import font_manager
  6. import numpy as np
  7. os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
  def view(info,weibo_file):
      """Plot post likes as bars with comment and repost counts as lines.

      Args:
          info: DataFrame with the columns '点赞数', '评论数', '转发数' and
              '微博作者'. The lines are plotted against the row index, so a
              0..n-1 index lines them up with the bars.
          weibo_file: Name embedded in the output image file name.
      """
      # Font file used for the Chinese chart title.
      my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
      likes = info['点赞数']
      reply = info['评论数']
      forward = info['转发数']
      # Author names label the bars because the post text is too long.
      author = info['微博作者']
      # Let axis texts render Chinese characters and the minus sign.
      plt.rcParams['font.sans-serif'] = ['SimHei']
      plt.rcParams['axes.unicode_minus'] = False

      # Bar chart: likes per post.
      fig, ax1 = plt.subplots()
      bars = ax1.bar(x=np.arange(len(author)), tick_label=author, height=likes,
                     color='blue', label='点赞数')
      plt.title('点赞数、评论数和转发数的数据分析', fontproperties=my_font)
      ax1.tick_params(labelsize=6)
      # The ticks show author names, so the axis is labelled accordingly
      # (it used to read "微博内容").
      plt.xlabel('微博作者')
      plt.ylabel('点赞数')
      plt.xticks(rotation=90, color='green')

      # Line charts on a second y axis: comment and repost counts.
      ax2 = ax1.twinx()
      ax2.plot(reply, color='red')
      ax2.plot(forward, color='yellow')
      plt.ylabel('评论/转发数')
      # Use the bars themselves as the legend handle instead of plotting a
      # dummy line just to obtain a legend entry.
      plt.legend(handles=[bars])

      # Build the path portably and make sure the folder exists; savefig
      # does not create missing directories.
      out_dir = os.path.join('.', '图片')
      os.makedirs(out_dir, exist_ok=True)
      plt.savefig(os.path.join(out_dir, 'pic-{}.png'.format(weibo_file)), dpi=1000, bbox_inches='tight')
      plt.show()
  40. def main(weibo_file):
  41. info = pd.read_csv(weibo_file,engine='python', encoding='utf-8-sig')
  42. info = info.nlargest(100, '点赞数')
  43. info = info.reset_index(drop=True)
  44. view(info,weibo_file)
  45. if __name__ == '__main__':
  46. main('微博清单_疫情_前10页.csv')

十一、weibo_top.py

  1. import os.path
  2. import re
  3. from jsonpath import jsonpath
  4. import requests
  5. import pandas as pd
  6. from fake_useragent import UserAgent
  7. import weibo_top_pic
  8. import weibo_top_pie
  9. import draw_cloud
  10. def get_weibo_top():
  11. keyword=list(['realtimehot','gym','game','fun'])
  12. for search_keyword in keyword:
  13. # 保存文件名
  14. v_weibo_file = '微博top_{}.csv'.format(search_keyword)
  15. # 如果csv存在,先删除
  16. if os.path.exists(v_weibo_file):
  17. os.remove(v_weibo_file)
  18. print('微博榜单存在,已删除:{}'.format(v_weibo_file))
  19. print('===开始爬取{}微博榜单==='.format(search_keyword))
  20. # 请求头
  21. ua = UserAgent()
  22. headers = {
  23. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
  24. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  25. "accept-encording": "gzip, deflate, br"
  26. }
  27. #请求地址
  28. url='https://m.weibo.cn/api/container/getIndex'
  29. #请求参数
  30. params={
  31. "containerid":"106003type=25&t=3&disable_hot=1&filter_type={}".format(search_keyword),
  32. "title": "微博热搜",
  33. "show_cache_when_error": 1,
  34. "extparam": "seat=1&dgr=0&filter_type=realtimehot&region_relas_conf=0&pos=0_0&c_type=30&lcate=1001&mi_cid=100103&cate=10103&display_time=1684642048&pre_seqid=144917672",
  35. "luicode": 10000011,
  36. "lfid": 231583,
  37. }
  38. #发送请求
  39. r=requests.get(url,headers=headers,params=params)
  40. print(r.status_code)
  41. #解析json数据
  42. cards=r.json()["data"]["cards"][0]["card_group"]
  43. #热搜内容
  44. text_list=jsonpath(cards,'$..desc')
  45. print('text_list is:')
  46. print(text_list)
  47. #热搜连接地址
  48. href_list = jsonpath(cards, '$..scheme')
  49. # 热搜排名
  50. order_list = jsonpath(cards, '$..pic')
  51. # 热搜热度
  52. view_count_list = jsonpath(cards, '$..desc_extr')
  53. j=1
  54. for i in range(0, len(order_list)):
  55. if order_list[i] == 'https://simg.s.weibo.com/20210408_search_point_orange.png':
  56. order_list[i] = '无'
  57. view_count_list[i]=0
  58. continue
  59. if order_list[i] == "https://simg.s.weibo.com/20180205110043_img_search_stick%403x.png":
  60. view_count_list.insert(0, 0)
  61. order_list[i] = '无'
  62. continue
  63. view_count_list[i]=str(view_count_list[i])
  64. view_count_list[i]=int(re.sub("\D", "", view_count_list[i]))
  65. order_list[i] = j
  66. j = j + 1
  67. print(len(order_list),len(text_list),len(view_count_list),len(href_list))
  68. df=pd.DataFrame(
  69. {
  70. '热搜排名':order_list,
  71. '热搜内容': text_list,
  72. '热搜热度': view_count_list,
  73. '热搜连接地址': href_list,
  74. }
  75. )
  76. #表头
  77. if os.path.exists(v_weibo_file):
  78. header=None
  79. else:
  80. header=['热搜排名','热搜内容','热搜热度','热搜连接地址']
  81. column = ['热搜排名','热搜内容','热搜热度','热搜连接地址']
  82. #保存到csv文件
  83. df.to_csv(v_weibo_file,mode='a+',index=False,columns=column, header=header, encoding='utf-8-sig')
  84. print('csv保存成功:{}'.format(v_weibo_file))
  85. weibo_top_pic.main(v_weibo_file)
  86. weibo_top_pie.pie(v_weibo_file)
  87. #draw_cloud.draw_cloud(v_weibo_file)
  88. if __name__=='__main__':
  89. #调用爬取微博函数
  90. get_weibo_top()

十二、weibo_top_pic.py

  1. # 允许副本存在,忽略报错
  2. import os
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from matplotlib import font_manager
  6. import numpy as np
  7. os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
  8. def view(info,weibo_file):
  9. my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf') # 设置中文字体(图标中能显示中文)
  10. heat = info['热搜热度']
  11. content = info['热搜内容']
  12. # 为了坐标轴上能显示中文
  13. plt.rcParams['font.sans-serif'] = ['SimHei']
  14. plt.rcParams['axes.unicode_minus'] = False
  15. # **********************************************************************综合评分和播放量对比
  16. # *******点赞数条形图
  17. fig, ax1 = plt.subplots()
  18. length=len(content)
  19. plt.bar(x = np.arange(length),tick_label=content, height=heat, color='blue') # 设置柱状图
  20. plt.title('热搜内容和热搜热度的数据分析', fontproperties=my_font) # 表标题
  21. ax1.tick_params(labelsize=6)
  22. plt.xlabel('热搜内容') # 横轴名
  23. plt.ylabel('热搜热度') # 纵轴名
  24. plt.xticks(rotation=90, color='green') # 设置横坐标变量名旋转度数和颜色
  25. plt.plot(1, label='热搜热度', color="blue", linewidth=5.0) # 图例
  26. plt.legend()
  27. plt.savefig('.\图片\pic-{}.png'.format(weibo_file), dpi=1000, bbox_inches='tight') # 保存至本地
  28. plt.show()

十三、weibo_top_pie.py

  1. import pandas as pd
  2. import numpy as np
  3. from pyecharts import options as opts
  4. from pyecharts.charts import Pie
  5. import matplotlib.pyplot as plt
  6. def pie(weibo_file):
  7. plt.rcParams['font.family']=['SimHei']
  8. plt.rcParams['axes.unicode_minus']=False
  9. data=pd.read_csv(weibo_file,engine='python', encoding='utf-8-sig')
  10. df1=data['热搜内容']
  11. df2=data['热搜热度']
  12. X=df1
  13. Y=[]
  14. s=sum(df2)
  15. for i in df2:
  16. a=i/s
  17. a=round(a,2)
  18. Y.append(a)
  19. plt.figure(figsize=(12, 12))
  20. plt.pie(x=Y,
  21. labels=X,
  22. wedgeprops={'width': 0.4},
  23. startangle=90,
  24. autopct='%.2f%%',
  25. pctdistance=0.9
  26. )
  27. plt.title('热搜对应的热度占比',fontsize=20)
  28. plt.savefig('.\图片\pie-{}.png'.format(weibo_file), dpi=1000, bbox_inches='tight') # 保存至本地
  29. plt.show()
  30. if __name__ == '__main__':
  31. pie('微博top_realtimehot.csv')

十四、pachong.py

  1. import weibo
  2. import weibo_top
  3. import weibo_comment
  4. import bilibili
  5. import bili_comment
  6. net=int(input("请选择爬取的网站:1.微博 2.b站 3.停止爬取"))
  7. while(net!=3):
  8. if (net==1):
  9. choice1=int(input("请选择爬取的方向:1.排行榜 2.关键词 3.评论"))
  10. if(choice1==1):
  11. # 调用爬取微博函数
  12. weibo_top.get_weibo_top()
  13. if (choice1 == 2):
  14. # 爬取关键字
  15. search_keyword = input("请输入搜索的关键词")
  16. # 爬取页数
  17. max_search_page = int(input("请输入搜索的页数"))
  18. # 调用爬取微博函数
  19. weibo.get_weibo_list(v_keyword=search_keyword, v_max_page=max_search_page)
  20. if (choice1 == 3):
  21. # 目标微博ID,可循环爬取多个(这里只爬一个)
  22. weiboID_list = [str(x) for x in input("请输入微博ID(示例:4903111417922777),以逗号分隔:").split(',')]
  23. # 最大爬取页
  24. max_page = int(input("请输入搜索的页数"))
  25. # 调用爬取
  26. weibo_comment.get_bili_comment(weiboID_list=weiboID_list, max_page=max_page)
  27. if (net==2):
  28. choice2=int(input("请选择爬取的方向:1.排行榜 2.关键词 3.评论"))
  29. if(choice2==1):
  30. # 初始化
  31. b = bilibili.bilihot()
  32. # 调用搜索
  33. b.findall()
  34. # 储存数据
  35. b.storeCsvdata()
  36. if (choice2 == 2):
  37. # search: 你需要搜索的数据
  38. search = input("请输入搜索的关键词")
  39. # page: 需要查看的页数
  40. page = int(input("请输入搜索的页数"))
  41. # 初始化类
  42. a = bilibili.bilisearch(search, page)
  43. # 将爬取的数据存入excel文件中
  44. a.findall()
  45. if (choice2 == 3):
  46. # 视频bv号,循环爬取多个视频评论
  47. bv_list = [str(x) for x in input("请输入视频bv号(示例:BV1Ss4y1M7KT,BV1VM411N7qc),以逗号分隔:").split(',')]
  48. # 最大爬取页
  49. max_page = int(input("请输入搜索的页数"))
  50. # 调用爬取
  51. bili_comment.get_bili_comment(bv_list=bv_list, max_page=max_page)
  52. net = int(input("请选择爬取的网站:1.微博 2.b站 3.停止爬取"))

十五、代码文件说明

pachong: b站、微博爬虫与数据可视化总程序

b站:
bilibili 爬取b站热搜榜和关键词搜索
bili_comment 爬取b站评论
bilihot_pic b站热搜榜数据可视化(柱形图、折线图)
bilisearch_pic b站关键词搜索数据可视化(柱形图、折线图)
bili_comment_pic b站评论数据可视化(柱形图、折线图)

微博:
weibo_top 爬取微博热搜榜
weibo 爬取微博关键词搜索
weibo_comment 爬取微博评论
weibo_top_pic 微博热搜榜数据可视化(柱形图)
weibo_top_pie 微博热搜榜数据可视化(环形图)
weibo_pic 微博关键词搜索数据可视化(柱形图、折线图)
weibo_comment_pic 微博评论数据可视化(柱形图、折线图)

draw_cloud 微博热搜榜数据可视化(词云图)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/57004
推荐阅读
相关标签
  

闽ICP备14008679号