当前位置:   article > 正文

Python翻页爬取B站视频_69re小视频

69re小视频

学习笔记:使用 Python 翻页爬取 B 站(哔哩哔哩)搜索结果中的视频,分别下载视频流和音频流,再用 ffmpeg 合并为一个 mp4 文件。

 

 

import os
import re
import time

import requests
from lxml import etree
from moviepy.editor import *
  # Paginated Bilibili search downloader: script body.
#
# Flow: ask for a keyword and a page count, collect the video-page URLs from
# the search result pages, then for each video download the separate video
# and audio streams and merge them into "<title>.mp4" with ffmpeg.
#
# Relies on the file-level imports (re, time, os, requests, lxml.etree).
# `ffmpeg_tools` is presumably supplied by `from moviepy.editor import *`
# -- TODO confirm for the installed moviepy version.

SEARCH_URL = "https://search.bilibili.com/all"
REQUEST_TIMEOUT = 30          # seconds; no request may hang forever
CHUNK_SIZE = 1024 * 1024      # bytes written to disk per streamed chunk
PAGE_DELAY = 2                # seconds to wait between search pages

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

# NOTE(review): the cookies below are hard-coded browser session values
# (the second one carries SESSDATA, i.e. a login credential). They are kept
# as-is to preserve behaviour, but they expire and should not live in source;
# consider loading them from configuration instead.
SEARCH_HEADERS = {
    "cookie": "_uuid=8E9986D8-D2B1-AE0F-9F4E-057B59DDE7B113399infoc; buvid3=B715F603-8F69-491D-9E9D-C6AD5F30E32C167628infoc; CURRENT_BLACKGAP=1; CURRENT_FNVAL=80; CURRENT_QUALITY=0; rpdid=|(kmJYYJJm~~0J'uYk~kmukuY; blackside_state=1; PVID=2; bsource=search_baidu; arrange=matrix; fingerprint=a19d6bac6f65a05c1eda15587959c900; buvid_fp=B715F603-8F69-491D-9E9D-C6AD5F30E32C167628infoc; buvid_fp_plain=88B631D5-026B-4E30-8AB4-32F3D649846518783infoc; sid=6ehvc2cf",
    'User-Agent': USER_AGENT,
}
VIDEO_PAGE_HEADERS = {
    'User-Agent': USER_AGENT,
    'Cookie': "INTVER=1; _uuid=09E58359-9210-BAEE-1A60-4EE46A2B82C655513infoc; sid=5ytmp0pi; LIVE_BUVID=AUTO6515760552512495; stardustvideo=1; laboratory=1-1; rpdid=|(umuum~uku|0J'ul~Y|llkl); buvid3=684E90B9-B796-4512-AE61-CED715ED0D2B53931infoc; stardustpgcv=0606; finger=158939783; DedeUserID=246639322; DedeUserID__ckMd5=0de51babcf36bfe1; SESSDATA=0453e7c3%2C1613286482%2C848b7*81; bili_jct=05a501d5099630a42cb271cebdbc3470; blackside_state=1; CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=4",
}

# Patterns are compiled once instead of on every video.
TITLE_RE = re.compile(r'(.*?)_哔哩哔哩')
VIDEO_URL_RE = re.compile(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"')
AUDIO_URL_RE = re.compile(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"')

# Characters removed from titles before they are used as file names: the five
# the script always removed (/ - | space &) plus the remaining characters that
# are invalid in Windows file names, and line breaks / tabs.
_UNSAFE_FILENAME_CHARS = str.maketrans('', '', '/-| &\\:*?"<>\r\n\t')


def clean_title(page_title):
    """Turn the text of a video page's <title> into a safe file-name stem.

    The part before the first "_哔哩哔哩" is used when that suffix is present,
    otherwise the whole title. Unsafe characters are removed; if nothing is
    left, 'untitled' is returned so a file can still be created.
    """
    match = TITLE_RE.search(page_title)
    stem = match.group(1) if match else page_title
    stem = stem.translate(_UNSAFE_FILENAME_CHARS)
    return stem or 'untitled'


def extract_stream_urls(playinfo):
    """Return (video_url, audio_url) found in the playinfo script text.

    Each URL is the "baseUrl" of the first entry of the "video" / "audio"
    array. Returns None when either one cannot be found, so the caller can
    skip the video instead of crashing on an IndexError.
    """
    video = VIDEO_URL_RE.search(playinfo)
    audio = AUDIO_URL_RE.search(playinfo)
    if video is None or audio is None:
        return None
    return video.group(1), audio.group(1)


def collect_video_urls(keyword, pages):
    """Collect video-page URLs from the first `pages` search result pages.

    Duplicates are dropped while keeping first-seen order, so the download
    order is stable from run to run.
    """
    found = []
    for page in range(1, pages + 1):
        # `params` lets requests URL-encode the keyword (spaces, '&', ...).
        response = requests.get(
            SEARCH_URL,
            params={'keyword': keyword, 'page': page},
            headers=SEARCH_HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        tree = etree.HTML(response.text)
        hrefs = tree.xpath('//a[@class="img-anchor"]/@href') if tree is not None else []
        for href in hrefs:
            # The original code always prefixed "https:", i.e. it expected
            # scheme-less hrefs; absolute links are left untouched.
            found.append(href if href.startswith('http') else 'https:' + href)
        if page < pages:
            time.sleep(PAGE_DELAY)  # throttle between search pages
    return list(dict.fromkeys(found))


def download_stream(url, path, headers):
    """Stream `url` to `path` in chunks and return the file size in KB.

    Raises requests.HTTPError on a non-2xx response so an error page is never
    saved as media. The size is read from the written file rather than from
    the Content-Length header, which is not guaranteed to be present.
    """
    with requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(path, 'wb') as handle:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    handle.write(chunk)
    return os.path.getsize(path) // 1024


def download_video(page_url):
    """Download one video page's streams and merge them into '<title>.mp4'.

    Returns True when the merged file was produced, False when the page had
    no usable title or stream URLs. Network and file errors propagate to the
    caller; the temporary stream files are removed in every case.
    """
    response = requests.get(page_url, headers=VIDEO_PAGE_HEADERS, timeout=REQUEST_TIMEOUT)
    tree = etree.HTML(response.text)
    titles = tree.xpath('//title/text()') if tree is not None else []
    scripts = (
        tree.xpath('//script[contains(text(),"window.__playinfo__")]/text()')
        if tree is not None else []
    )
    streams = extract_stream_urls(scripts[0]) if scripts else None
    if not titles or streams is None:
        print(f'{page_url}>>>未找到标题或音视频地址,已跳过...')
        return False

    title = clean_title(titles[0])
    video_url, audio_url = streams
    # The Referer must be the page of the video being downloaded. The
    # original code sent a stale URL left over from the search loop.
    stream_headers = {'User-Agent': USER_AGENT, 'Referer': page_url}

    raw_name = title + '!'  # temp name differs from the final one to avoid a clash
    video_path = f'{raw_name}.mp4'
    audio_path = f'{raw_name}.mp3'
    merged_path = f'{title}.mp4'
    try:
        video_kb = download_stream(video_url, video_path, stream_headers)
        print(f'{raw_name}>>>纯视频文件下载完毕...,大小为:{video_kb}KB, {video_kb // 1024}MB')
        audio_kb = download_stream(audio_url, audio_path, stream_headers)
        print(f'{raw_name}>>>纯音频文件下载完毕...,大小为:{audio_kb}KB, {audio_kb // 1024}MB')
        ffmpeg_tools.ffmpeg_merge_video_audio(video_path, audio_path, merged_path)
    finally:
        # Remove the raw stream files even when a download or the merge fails.
        for leftover in (video_path, audio_path):
            if os.path.exists(leftover):
                os.remove(leftover)

    merged_kb = os.path.getsize(merged_path) // 1024
    print(f'{title}>>>视频合成成功...,大小为{merged_kb}KB, {merged_kb // 1024}MB......')
    return True


if __name__ == '__main__':
    keyword = input("请输入你要搜索的内容:")
    pages = int(input("请输入你要爬取的页数:"))

    video_pages = collect_video_urls(keyword, pages)
    print(len(video_pages))
    print(video_pages)

    for page_url in video_pages:
        try:
            download_video(page_url)
        except (requests.RequestException, OSError) as exc:
            # One broken video must not abort the rest of the batch.
            print(f'{page_url}>>>下载或合成失败,已跳过:{exc}')
        # Visual separator between videos.
        print('*' * 100)
    print('视频全部抓取完毕......')

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/59809
推荐阅读
相关标签
  

闽ICP备14008679号