当前位置:   article > 正文

哔哩哔哩视频信息爬虫(实时爬取)_哔哩哔哩爬虫

哔哩哔哩爬虫

结合  哔哩哔哩小助手程序

爬取思路

自定义模块构建及框架设计

 

文件目录

__init__.py:

  # __init__
  """Package-level settings shared by the spider modules.

  Fields of interest when browsing the JSON data:

  videoinfo = [
      data['aid'],       # AV number
      data['view'],      # play count
      data['like'],      # likes
      data['favorite'],  # favourites
      data['share'],     # shares
      data['reply'],     # comments
      data['danmaku'],   # danmaku (bullet comments)
      data['coin'],      # coins
      data['title'],     # title
      data['tname'],     # category
  ]
  """

  # Default request headers sent with every API call.
  # The User-Agent is assembled with implicit string concatenation rather than
  # a backslash continuation inside the literal: with a backslash, any
  # indentation on the continuation line silently becomes part of the header.
  headers = {
      "User-Agent": (
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
      ),
  }

WebDownloader模块:(请求并加载网页模块)

  1. #WebDownloader
  2. import requests
  3. from BilibiliSpider import headers
  class WebDownloader:
      """Fetch a URL and return its body decoded as JSON.

      Args:
          headers: HTTP headers sent with every request. Defaults to the
              package-level ``headers`` imported from ``BilibiliSpider``.
          timeout: Timeout passed to ``requests.get``.
      """

      def __init__(self, headers=headers, timeout=6):
          self.headers = headers
          self.timeout = timeout

      def getJsonWeb(self, url):
          """Request ``url`` and return the parsed JSON document.

          Returns:
              The decoded JSON value, or the string ``"error"`` when the
              request fails, the server answers with an HTTP error status,
              or the body is not valid JSON.
          """
          try:
              r = requests.get(url, headers=self.headers, timeout=self.timeout)
              r.raise_for_status()
              r.encoding = r.apparent_encoding
              return r.json()
          except (requests.RequestException, ValueError):
              # Callers rely on the "error" sentinel rather than an exception.
              # ValueError covers a body that fails to decode as JSON.
              return "error"

JsonParse模块:(网页内容解析)

  1. #JsonParse
  2. #
  3. import json
  4. from BilibiliSpider import WebDownloader
  5. import threading
  class JsonParse:
      """Pull individual fields out of a decoded ``view/detail`` API response.

      Every parser is best-effort: when the page does not have the expected
      ``data -> View`` structure (for example when the downloader handed back
      its ``"error"`` sentinel) the call returns ``None`` and leaves the
      caller's dict untouched.

      Args:
          total: Counter stored on the instance; not used by the parsers here.
          lock: Lock guarding writes to the shared index dict. When omitted,
              one lock shared by all instances is used.
      """

      # Shared by every instance that is not given its own lock, so parsers
      # running on different threads still serialise their writes.
      _shared_lock = threading.Lock()

      def __init__(self, total=1, lock=None):
          self.lock = self._shared_lock if lock is None else lock
          self.total = total

      def parseStat(self, dict_json, jsonPage):
          """Record ``aid -> [title, category]`` in ``dict_json``.

          This mapping is what later gets written to the JSON index file.

          Returns:
              ``""`` when an entry was stored, ``None`` when the page could
              not be parsed or carries no AV number.
          """
          try:
              view = jsonPage['data']['View']
              aid = view['aid']
              entry = [view['title'], view['tname']]
              if aid is None:
                  # No AV number: nothing to index.
                  return None
              # Hold the lock around the write itself; worker threads share
              # the same dict.
              with self.lock:
                  dict_json[aid] = entry
          except (KeyError, TypeError):
              return None
          return ""

      def parseJsonImage(self, jsonPage):
          """Return the cover image URL (``View['pic']``), or ``None`` if absent."""
          try:
              return jsonPage['data']['View']['pic']
          except (KeyError, TypeError):
              return None

      def parseJsonList(self, dict_json, jsonPage):
          """Fill ``dict_json`` with the video's details for real-time display.

          All fields are read before anything is written, so a page with a
          missing field leaves ``dict_json`` unchanged. Always returns ``None``.
          """
          try:
              view = jsonPage['data']['View']
              stat = view['stat']
              info = {
                  '视频名称:': view['title'],
                  'AV号:': view['aid'],
                  '分类:': view['tname'],
                  '视频简介:': view['desc'],
                  '播放量:': stat['view'],
                  '点赞:': stat['like'],
                  '收藏:': stat['favorite'],
                  '转发:': stat['share'],
                  '评论:': stat['reply'],
                  '弹幕:': stat['danmaku'],
                  '硬币:': stat['coin'],
              }
              for label, value in info.items():
                  dict_json[label] = value
          except (KeyError, TypeError):
              return None
          return None

UrlFactory模块:(api-url工厂,获取对应标题的API链接)

  1. #UrlFactory
  2. """
  3. detail? :https://api.bilibili.com/x/web-interface/view/detail?&aid=77515252
  4. stat? :https://api.bilibili.com/x/web-interface/archive/stat?aid=11111111
  5. """
  6. #api_urlStat = 'https://api.bilibili.com/x/web-interface/archive/stat?aid='
  7. import json
  class UrlFactory:
      """Build detail-API URLs from the ``href.json`` index.

      Args:
          api_urlDetail: URL prefix to which the AV number is appended.
      """

      def __init__(self, api_urlDetail='https://api.bilibili.com/x/web-interface/view/detail?&aid='):
          self.api_urlDetail = api_urlDetail

      def getUrlJson(self, title):
          """Return the API URL of the first indexed video whose title equals ``title``.

          Reads ``href.json`` from the current working directory. The file
          maps each AV number to ``[title, category]``.

          Returns:
              The API URL as a string, or ``None`` when no entry has that
              title (previously this case raised ``UnboundLocalError``).
          """
          with open('href.json', mode='r', encoding='utf-8') as fjson:
              data = json.load(fjson)
          for aid, entry in data.items():
              if entry[0] == title:
                  return self.api_urlDetail + aid
          return None

主函数A

  # Download AV numbers: crawl the detail API in batches and save the index.
  # NOTE(review): this excerpt assumes `json`, `futures` (concurrent.futures)
  # and `videoInfoSpider` are imported/defined earlier in the script, and that
  # `v.apiUrlCrawl` records each hit in `dict_json` -- confirm against the
  # full source, none of that is shown here.
  total = 0
  dict_json = {}
  v = videoInfoSpider()
  print('开始爬取apiUrl...')
  try:
      for i in range(1, 2019):
          # Advance the id window on every batch. The listing reset `start`
          # to 10000 each time, so every batch re-crawled the same ids.
          start = 10000 * i
          urls = [
              "https://api.bilibili.com/x/web-interface/view/detail?&aid={}".format(j)
              for j in range(start, start + 10000)
          ]
          # Leaving the `with` block waits for all submitted requests to finish.
          with futures.ThreadPoolExecutor(64) as executor:
              executor.map(v.apiUrlCrawl, urls)
          print(total)  # number of batches completed so far
          total += 1
  finally:
      # Write the index once, as a single JSON document. Appending a dump per
      # batch leaves several concatenated documents in the file, which
      # json.loads() cannot read back. `finally` keeps whatever was collected
      # if the crawl is interrupted.
      with open('href.json', 'w') as fjson:
          data = json.dumps(dict_json, indent=4)
          fjson.write(data)
  print("爬取结束!")

首先运行主函数A,得到一个 href.json 文件(内容为 av号:[视频标题, 视频分类]),作为后续实时爬取时查找 API 链接的索引。

接下来就轮到 UrlFactory 模块发挥作用了:调用 UrlFactory 中的 getUrlJson(),即可获得与搜索标题对应的视频信息 API 链接;然后用 WebDownloader 的 getJsonWeb() 请求该链接;最后调用 JsonParse 模块解析返回的 JSON,即可得到相应的视频信息。

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/57170
推荐阅读
相关标签
  

闽ICP备14008679号