赞
踩
异步爬虫可以理解为不只使用单线程的爬虫
我们下面做个例子,之前我们通过单线程爬取过梨视频 https://blog.csdn.net/potato123232/article/details/135672504
在保存视频的时候会慢一些,为了提升效率,我们使用异步爬虫爬取
目录
线程池的基本用法在这里有提到 python并发任务-CSDN博客
多线程应仅用于耗时的部分,如果我们为了省事去将所有部分都封装为一个函数就容易出错
- import requests
- from lxml import etree
- import random
- import re
- from multiprocessing.dummy import Pool
-
# Fetch the "popular" listing page and keep a local copy of it for inspection.
url = 'https://www.pearvideo.com/popular'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
with open('./test.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
print(response)

# Collect the detail-page URL and the title of every entry in the list.
# Each <li> is visited once and queried with relative XPaths instead of
# re-running an indexed absolute XPath for every item.
tree = etree.HTML(response.text)
detail_htmls = []
for li in tree.xpath('//*[@id="popularList"]/li'):
    detail_htmls.append({
        'url': 'https://www.pearvideo.com/' + li.xpath('./a/@href')[0],
        'title': li.xpath('./div[2]/a/h2/text()')[0],
    })

# print(detail_htmls)
# Captures the part of the file name (after the last '/') that precedes
# the first "-<digit>"; that part is swapped out below.
p = re.compile(r'.*\/(.*?)-\d')

# The Host header is the same for every status request, so set it once;
# only the Referer changes per video.
headers['Host'] = 'www.pearvideo.com'

video_detail_list = []
for i in detail_htmls:
    contId = i['url'].split('_')[-1]
    mrd = round(random.random(), 16)
    headers['Referer'] = i['url']
    status = requests.get(
        url='https://www.pearvideo.com/videoStatus.jsp?contId=' + str(contId) + '&mrd=' + str(mrd),
        headers=headers,
    )

    # NOTE: the body comes from a remote server, so it is parsed as JSON
    # rather than passed to eval(), which would execute arbitrary
    # expressions and also fails on JSON literals such as true/false/null.
    srcUrl = status.json().get('videoInfo').get('videos').get('srcUrl')

    # Replace the captured segment with "cont-<contId>". Only the first
    # occurrence is replaced so the rest of the URL is always kept intact.
    need_change_part = p.findall(srcUrl)[0]
    true_video_url = srcUrl.replace(need_change_part, 'cont-' + contId, 1)

    # Strip characters that are not allowed in Windows file names.
    video_name = re.sub(r'[\\/:*?"<>|]', '', i['title'])
    video_detail_list.append({"name": video_name, "url": true_video_url})

print(video_detail_list)
def get_video(item):
    """Download one video and save it as ./result/<name>.mp4.

    Args:
        item: dict with a "name" key (file name without extension) and a
            "url" key (address of the video file).

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never saved as a video.
    """
    import os

    # open() does not create missing directories.
    os.makedirs('./result', exist_ok=True)

    # Stream the body in chunks instead of holding the whole video in memory;
    # the timeout keeps a stalled connection from blocking a worker forever.
    with requests.get(item['url'], stream=True, timeout=30) as response:
        response.raise_for_status()
        with open('./result/' + str(item['name']) + '.mp4', 'wb') as fp:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)
    print(item['url'] + '下载成功')
-
# Download with four worker threads. map() blocks until every item has been
# processed, and the with-block then releases the pool's threads.
with Pool(4) as pool:
    pool.map(get_video, video_detail_list)

耗时的部分只有保存,所以我们把保存的部分剥离出来,这样就能成功爬取10个视频

我们先做个简单的服务,这三个服务无论请求哪一个都会等待两秒,然后返回一个字符串

之后我们尝试只用asyncio发起异步请求

从耗时来看这段代码并没有发起异步请求

这个时候我们可以使用aiohttp进行异步请求


异步保存文件可以借助 aiofiles
- import requests
- from lxml import etree
- import random
- import re
- import aiohttp
- import asyncio
- import aiofiles
-
# Fetch the "popular" listing page and keep a local copy of it for inspection.
url = 'https://www.pearvideo.com/popular'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
with open('./test.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
print(response)

# Collect the detail-page URL and the title of every entry in the list.
# Each <li> is visited once and queried with relative XPaths instead of
# re-running an indexed absolute XPath for every item.
tree = etree.HTML(response.text)
detail_htmls = []
for li in tree.xpath('//*[@id="popularList"]/li'):
    detail_htmls.append({
        'url': 'https://www.pearvideo.com/' + li.xpath('./a/@href')[0],
        'title': li.xpath('./div[2]/a/h2/text()')[0],
    })

# print(detail_htmls)
# Captures the part of the file name (after the last '/') that precedes
# the first "-<digit>"; that part is swapped out below.
p = re.compile(r'.*\/(.*?)-\d')

# The Host header is the same for every status request, so set it once;
# only the Referer changes per video.
headers['Host'] = 'www.pearvideo.com'

video_detail_list = []
for i in detail_htmls:
    contId = i['url'].split('_')[-1]
    mrd = round(random.random(), 16)
    headers['Referer'] = i['url']
    status = requests.get(
        url='https://www.pearvideo.com/videoStatus.jsp?contId=' + str(contId) + '&mrd=' + str(mrd),
        headers=headers,
    )

    # NOTE: the body comes from a remote server, so it is parsed as JSON
    # rather than passed to eval(), which would execute arbitrary
    # expressions and also fails on JSON literals such as true/false/null.
    srcUrl = status.json().get('videoInfo').get('videos').get('srcUrl')

    # Replace the captured segment with "cont-<contId>". Only the first
    # occurrence is replaced so the rest of the URL is always kept intact.
    need_change_part = p.findall(srcUrl)[0]
    true_video_url = srcUrl.replace(need_change_part, 'cont-' + contId, 1)

    # Strip characters that are not allowed in Windows file names.
    video_name = re.sub(r'[\\/:*?"<>|]', '', i['title'])
    video_detail_list.append({"name": video_name, "url": true_video_url})

print(video_detail_list)
-
async def test(item):
    """Download one video asynchronously and save it as ./result/<name>.mp4.

    Args:
        item: dict with a "name" key (file name without extension) and a
            "url" key (address of the video file).

    Raises:
        aiohttp.ClientResponseError: if the server answers with an error
            status, so an error page is never saved as a video.
    """
    import os

    # aiofiles.open() does not create missing directories.
    os.makedirs('./result', exist_ok=True)

    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager; no extra await.
        async with session.get(item['url']) as response:
            response.raise_for_status()
            async with aiofiles.open('./result/' + str(item['name']) + '.mp4', 'wb') as fp:
                # Write chunk by chunk instead of reading the whole video
                # into memory first.
                async for chunk in response.content.iter_chunked(64 * 1024):
                    await fp.write(chunk)
    print(item['url'] + '下载成功')
-
async def _download_all():
    """Run test() for every entry of video_detail_list concurrently.

    All downloads are allowed to finish; afterwards the first failure (if
    any) is re-raised so errors are not silently discarded.
    """
    # gather() accepts zero awaitables, so an empty list is not an error.
    results = await asyncio.gather(
        *(test(something1) for something1 in video_detail_list),
        return_exceptions=True,
    )
    for result in results:
        if isinstance(result, BaseException):
            raise result


# asyncio.run() creates the event loop, runs the coroutine and closes the
# loop again; tasks are created inside the running loop instead of relying
# on an implicit global loop.
asyncio.run(_download_all())
-

可以爬取成功,每个视频都可以点开看

但是代码在pycharm的返回值并不是0

就梨视频的例子来说,单线程最慢,多线程第二(因为我只用4线程,如果10线程应该还会快一些),感觉上来讲协程最快
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。