赞
踩
结合
爬取思路:
自定义模块构建及框架设计:

文件目录:

__init__.py:
# __init__
"""Package-level settings for the BilibiliSpider modules.

Fields of interest when browsing the video JSON data:

    videoinfo = [
        data['aid'],       # av number
        data['view'],      # play count
        data['like'],      # likes
        data['favorite'],  # favorites
        data['share'],     # shares
        data['reply'],     # comments
        data['danmaku'],   # danmaku (bullet comments)
        data['coin'],      # coins
        data['title'],     # title
        data['tname'],     # category
    ]
"""

# Default HTTP request headers.
headers = {
    # Adjacent string literals are joined at compile time. A backslash
    # continuation *inside* the literal would instead embed the next line's
    # leading whitespace in the header value that gets sent.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    ),
}
-

WebDownloader模块:(请求并加载网页模块)
- #WebDownloader
-
-
-
-
-
-
- import requests
- from BilibiliSpider import headers
-
-
-
-
-
class WebDownloader:
    """Request a URL and return its decoded JSON body."""

    def __init__(self, headers=headers, timeout=6):
        """Store the request settings.

        Args:
            headers: HTTP headers sent with every request; defaults to the
                module-level ``headers`` imported from the package.
            timeout: Value passed as ``timeout`` to ``requests.get``.
        """
        self.headers = headers
        self.timeout = timeout

    # Fetch the JSON page to be crawled.
    def getJsonWeb(self, url):
        """Download ``url`` and return the parsed JSON payload.

        Args:
            url: Address of the JSON resource.

        Returns:
            The decoded JSON object, or the string ``"error"`` when the
            request fails, the server answers with an error status, or the
            body is not valid JSON.
        """
        try:
            r = requests.get(url, headers=self.headers, timeout=self.timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.json()
        except (requests.RequestException, ValueError):
            # Only network/HTTP failures and undecodable bodies map to the
            # "error" sentinel; KeyboardInterrupt, SystemExit and genuine
            # programming errors are no longer swallowed.
            return "error"
-
-
-

JsonParse模块:(网页内容解析)
- #JsonParse
- #
-
-
-
- import json
- from BilibiliSpider import WebDownloader
-
- import threading
-
-
class JsonParse:
    """Pull video fields out of decoded JSON pages.

    Every parser expects a page shaped like ``{'data': {'View': {...}}}``.
    A page without that shape (including the ``"error"`` string or ``None``)
    is skipped silently.
    """

    # NOTE: the default lock is created once, when the ``def`` is evaluated,
    # so every instance built without an explicit ``lock`` shares it.
    def __init__(self, total=1, lock=threading.Lock()):
        """Store the counter value and the lock guarding shared results.

        Args:
            total: Stored on the instance as ``self.total``.
            lock: Lock held while a shared result dict is updated.
        """
        self.lock = lock
        self.total = total

    # Collect data destined for the JSON file: av number -> [title, category].
    def parseStat(self, dict_json, jsonPage):
        """Record ``aid -> [title, category]`` in ``dict_json``.

        Args:
            dict_json: Mapping updated in place.
            jsonPage: Decoded JSON page.

        Returns:
            ``""`` when an entry was stored; ``None`` when the page is
            malformed or its av number is ``None``.
        """
        try:
            view = jsonPage['data']['View']
            aid = view['aid']
            sort = view['tname']
            title = view['title']
        except (KeyError, TypeError):
            # Missing field, or the page is not a mapping at all.
            return None

        if aid is None:  # no av number, nothing to record
            return None

        # Hold the lock for the write itself, so the shared dict is what
        # the lock actually protects.
        with self.lock:
            dict_json[aid] = [title, sort]
        return ""

    def parseJsonImage(self, jsonPage):
        """Return the cover image URL (``View['pic']``), or ``None`` if absent."""
        try:
            return jsonPage['data']['View']['pic']
        except (KeyError, TypeError):
            return None

    # Used for crawling video information in real time.
    def parseJsonList(self, dict_json, jsonPage):
        """Fill ``dict_json`` with the video's details and statistics.

        All fields are read before any is written, so a malformed page
        leaves ``dict_json`` untouched.

        Args:
            dict_json: Mapping updated in place with labelled values.
            jsonPage: Decoded JSON page.
        """
        try:
            view = jsonPage['data']['View']
            aid = view['aid']
            sort = view['tname']
            detail = view['desc']
            title = view['title']

            stat = view['stat']
            play = stat['view']
            like = stat['like']
            collect = stat['favorite']
            share = stat['share']
            reply = stat['reply']
            danmaku = stat['danmaku']
            coin = stat['coin']
        except (KeyError, TypeError):
            return

        dict_json['视频名称:'] = title
        dict_json['AV号:'] = aid
        dict_json['分类:'] = sort
        dict_json['视频简介:'] = detail
        dict_json['播放量:'] = play
        dict_json['点赞:'] = like
        dict_json['收藏:'] = collect
        dict_json['转发:'] = share
        dict_json['评论:'] = reply
        dict_json['弹幕:'] = danmaku
        dict_json['硬币:'] = coin
-

UrlFactory模块:(api-url工厂,获取对应标题的API链接)
- #UrlFactory
-
-
- """
- detail? :https://api.bilibili.com/x/web-interface/view/detail?&aid=77515252
- stat? :https://api.bilibili.com/x/web-interface/archive/stat?aid=11111111
- """
- #api_urlStat = 'https://api.bilibili.com/x/web-interface/archive/stat?aid='
-
-
- import json
-
-
class UrlFactory:
    """Build detail-API URLs for videos listed in a local JSON index."""

    def __init__(self, api_urlDetail='https://api.bilibili.com/x/web-interface/view/detail?&aid='):
        """Store the URL prefix to which an av number is appended.

        Args:
            api_urlDetail: Prefix of the detail API URL.
        """
        self.api_urlDetail = api_urlDetail

    # Look up the API URL in the JSON file.
    def getUrlJson(self, title, path='href.json'):
        """Return the detail-API URL of the first video whose title matches.

        The file is expected to hold one JSON object mapping
        av number -> [video title, video category].

        Args:
            title: Exact video title to search for.
            path: Location of the JSON index; defaults to ``href.json`` in
                the current working directory.

        Returns:
            The URL prefix followed by the matching av number.

        Raises:
            LookupError: If no entry carries ``title``. The lookup used to
                fail with an opaque ``UnboundLocalError`` in this case.
        """
        with open(path, mode='r', encoding='utf-8') as fjson:
            data = json.load(fjson)

        # av number: [video title, video category]
        for aid, entry in data.items():
            if entry[0] == title:
                return self.api_urlDetail + aid

        raise LookupError(
            "no video titled {!r} found in {}".format(title, path)
        )
-

主函数A:
# Download av numbers: probe the detail API for consecutive av ids in
# batches of 10,000 and save the collected index to href.json.
total = 0  # number of batches finished so far
dict_json = {}
# NOTE(review): presumably v.apiUrlCrawl records its results in dict_json;
# confirm against the videoInfoSpider definition.
v = videoInfoSpider()
print('开始爬取apiUrl...')
for i in range(1, 2019):
    # Advance the window with the batch index. A constant start value made
    # every iteration request the same 10,000 ids again.
    start = i * 10000
    urls = [
        "https://api.bilibili.com/x/web-interface/view/detail?&aid={}".format(j)
        for j in range(start, start + 10000)
    ]
    with futures.ThreadPoolExecutor(64) as executor:
        executor.map(v.apiUrlCrawl, urls)
    print(total)
    total += 1
    # Rewrite the whole file after each batch. Appending a fresh dump each
    # time concatenates several JSON documents, which json.loads rejects
    # when the file is read back.
    with open('href.json', 'w') as fjson:
        data = json.dumps(dict_json, indent=4)
        fjson.write(data)

print("爬取结束!")

首先运行主函数A,得到一个json文件(href.json,内容为 av号:[视频标题,视频分类]),供后续实时爬取时查找对应视频的API链接。

接下来轮到UrlFactory模块:调用UrlFactory中的 getUrlJson() 即可获得与搜索标题对应的视频信息URL,再用WebDownloader模块的 getJsonWeb() 请求该URL,最后调用JsonParse模块解析,即可得到相应的信息。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。