
Python Web Scraping Summary: Case Code

Contents

Basic usage of requests

Using urllib

Image scraping

Fetching dynamic data

Session and cookie handling

Parsing with XPath

Parsing with regular expressions

Using BeautifulSoup

Selenium automated scraping

Other automated operations

Headless mode

Handling iframe tags automatically

12306 user login with Selenium

Using proxies

Captcha recognition

Using coroutines

Synchronous crawler

Multithreaded and asynchronous crawling

Thread pools

Asynchronous coroutines

Asynchronous tasks with aiohttp

Distributed crawling

Simple practice projects

KFC store-query interface

Scraping résumé templates

Crawling with Baidu AI


These are my notes on non-framework Python web scraping written quite a while ago and never organized. I finally had time to tidy them up today so they are easier to look up later.

Basic usage of requests

Example 1

# -*- coding: utf-8 -*-
import requests

if __name__ == "__main__":
    # step 1: specify the URL
    url = 'https://www.sogou.com/'
    # step 2: send the request
    response = requests.get(url=url)
    # step 3: get the response data; .text returns the response body as a string
    page_text = response.text
    print(page_text)
    # step 4: persist the data
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('Scraping finished!')

Example 2

# -*- coding: utf-8 -*-
import requests
import json

if __name__ == "__main__":
    url = 'https://movie.douban.com/j/search_subjects'
    param = {
        'type': 'movie',
        'tag': "喜剧",
        'sort': 'recommend',
        'page_limit': 20,  # number of items to fetch at a time
        'page_start': 20,  # index of the first movie to fetch
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_data = response.json()
    fp = open('./douban.json', 'w', encoding='utf-8')
    json.dump(list_data, fp=fp, ensure_ascii=False)
    print('over!!!')

Example 3

# -*- coding: utf-8 -*-
import requests
import json

if __name__ == "__main__":
    # 1. specify the URL
    post_url = 'https://fanyi.baidu.com/sug'
    # 2. UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # 3. build the request parameters (same as for a GET request)
    word = input('enter a word:')
    data = {
        'kw': word
    }
    # 4. send the request
    response = requests.post(url=post_url, data=data, headers=headers)
    # 5. get the response data: json() returns a Python object
    #    (only call json() if the response really is JSON)
    dic_obj = response.json()
    # persist the data
    fileName = word + '.json'
    fp = open(fileName, 'w', encoding='utf-8')
    json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('over!!!')

Example 4

# -*- coding: utf-8 -*-
# Every crawl should spoof the User-Agent so the request looks like it comes from a real browser.
# User-Agent: identity of the request carrier
import requests

if __name__ == "__main__":
    # UA spoofing: put the corresponding User-Agent into a dict
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    url = 'https://www.sogou.com/web'
    # put the URL query parameters into a dict
    kw = input('enter a word:')
    param = {
        'query': kw
    }
    # the request is sent to the URL with the parameters attached and processed
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, 'saved successfully!')

Using urllib

import requests
import re
import os
import urllib.request

dirName = "imgLab"
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = "https://www.baidu.com/s?wd=%E7%8B%97&tn=98012088_5_dg&ch=11"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.37',
}
response = requests.get(url=url, headers=headers)
page_text = response.text
# regex that captures the image URLs out of the result page
ex = '<div class="op-img-address-divide-high">.*?<img src="(.*?)" class=.*?</div>'
img_src_list = re.findall(ex, page_text, re.S)
for src in img_src_list:
    imgPath = dirName + "/" + src.split('/')[-1]
    src = src + '&fm=26'
    # urlretrieve downloads the URL straight into a local file
    urllib.request.urlretrieve(src, imgPath)
    print(imgPath, 'downloaded successfully!')

Image scraping

Example 1

from lxml import etree
import requests
import os
import urllib.request

fileName = "图片"
if not os.path.exists(fileName):
    os.mkdir(fileName)
url = "https://pic.netbian.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
arr = []
for li in li_list:
    href = 'https://pic.netbian.com' + li.xpath('./a/span/img/@src')[0]
    arr.append(href)
for ar in arr:
    filePath = fileName + '/' + ar.split('/')[-1]
    urllib.request.urlretrieve(ar, filePath)
print("Scraping finished!")

Example 2

# -*- coding: utf-8 -*-
import requests

if __name__ == "__main__":
    # how to scrape image data
    url = 'https://th.bing.com/th/id/R6706ad2e7a68edabddbc1b5644707c4f?rik=u8uR%2bWe5bxIosA&riu=http%3a%2f%2fpic.lvmama.com%2fuploads%2fpc%2fplace2%2f2016-09-14%2f9aab9bb7-2593-4ca6-8c5a-31355443aebc.jpg&ehk=HpOwqU6w6%2fssF4CJQMbTOshMh4lIXJONXU%2btYNsAKSI%3d&risl=1&pid=ImgRaw'
    # content returns the response data in binary form
    # text (string), content (bytes), json() (object)
    img_data = requests.get(url=url).content
    with open('./qiutu.jpg', 'wb') as fp:
        fp.write(img_data)

Fetching dynamic data

# -*- coding: utf-8 -*-
import requests
import json

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # fetch the id values of the different companies in batches
    url = ''
    # parameter packaging
    id_list = []        # stores the company ids
    all_data_list = []  # stores the detail data of every company
    for page in range(1, 6):
        page = str(page)
        data = {
        }
        json_ids = requests.post(url=url, headers=headers, data=data).json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    # fetch the detail data of every company
    post_url = ''
    for id in id_list:
        data = {
            'id': id
        }
        detail_json = requests.post(url=post_url, headers=headers, data=data).json()
        all_data_list.append(detail_json)
    # persist all_data_list
    fp = open('./allData.json', 'w', encoding='utf-8')
    json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('over!!!')

Session and cookie handling
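The original notes left this section without code. As a reference, here is a minimal sketch of the usual approach with requests.Session: the session object stores the cookies returned by one request (for example a login) and automatically sends them with every later request. The login URL and form fields below are placeholders, not a real site's interface.

# Minimal sketch of cookie handling with requests.Session (placeholder URL and form fields).
import requests

headers = {
    'User-Agent': 'Mozilla/5.0'
}
# the Session object keeps cookies between requests automatically
session = requests.Session()
# the login response sets cookies, which the session stores
session.post(url='https://example.com/login', data={'user': 'xxx', 'pwd': 'xxx'}, headers=headers)
# later requests made through the same session carry those cookies along
page_text = session.get(url='https://example.com/profile', headers=headers).text
print(page_text)

Without a Session, the cookie returned by the login request would have to be copied into the headers of every following request by hand.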

Parsing with XPath

Example 1

# -*- coding: utf-8 -*-
from lxml import etree

if __name__ == '__main__':
    # instantiate an etree object and load the source code to be parsed into it
    tree = etree.parse('r.html')
    # r = tree.xpath('/html/body/div')
    # r = tree.xpath('/html//div')
    # r = tree.xpath('//div')
    # r = tree.xpath('//div[@class="song"]')
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]
    # r = tree.xpath('//li[7]//text()')
    # r = tree.xpath('//div[@class="tang"]//text()')
    r = tree.xpath('//div[@class="song"]/img/@src')
    print(r)

Example 2

# -*- coding: utf-8 -*-
# Goal: scrape the second-hand housing listings from 58.com
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = 'https://www.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # parse the data
    tree = etree.HTML(page_text)
    # td_list stores element objects
    td_list = tree.xpath('//td[@class="t"]')
    fp = open('58.txt', 'w', encoding='utf-8')
    for td in td_list:
        title = td.xpath('./a/text()')[0]
        print(title)
        fp.write(title + '\n')
    fp.close()

Example 3

# -*- coding: utf-8 -*-
# Goal: parse and download image data
import requests
from lxml import etree
import os

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = 'https://pic.netbian.com/4kyouxi/'
    response = requests.get(url=url, headers=headers)
    # set the response encoding manually if needed
    # response.encoding = 'utf-8'
    page_text = response.text
    # parse the data
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    # create a directory
    if not os.path.exists('./picLibs'):
        os.mkdir('./picLibs')
    for li in li_list:
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # generic fix for garbled Chinese file names
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # print(img_name, img_src)
        # request the image and persist it
        img_data = requests.get(url=img_src, headers=headers).content
        img_path = 'picLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded successfully!')

Example 4

# -*- coding: utf-8 -*-
# Goal: parse out the names of all cities
import requests
from lxml import etree

if __name__ == "__main__":
    '''headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = 'https://www.aqistudy.cn/historydata/'
    response = requests.get(url=url, headers=headers)
    # set the response encoding manually if needed
    # response.encoding = 'utf-8'
    page_text = response.text
    # parse the data
    tree = etree.HTML(page_text)
    host_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    all_city_names = []
    # parse the names of the hot cities
    for li in host_li_list:
        hot_city_name = li.xpath('./a/text()')[0]
        all_city_names.append(hot_city_name)
    # parse the names of all the other cities
    city_names_list = tree.xpath('div[@class="bottom"]/ul/div[2]/li')
    for li in city_names_list:
        city_name = li.xpath('./a/text()')[0]
        all_city_names.append(city_name)
    print(all_city_names, len(all_city_names))'''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = 'https://www.aqistudy.cn/historydata/'
    response = requests.get(url=url, headers=headers)
    # set the response encoding manually if needed
    # response.encoding = 'utf-8'
    page_text = response.text
    # parse the data
    tree = etree.HTML(page_text)
    # one XPath expression that matches both the hot cities and the full city list
    a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/div[2]/li/a')
    all_city_names = []
    for a in a_list:
        city_name = a.xpath('./text()')[0]
        all_city_names.append(city_name)
    print(all_city_names, len(all_city_names))

Parsing with regular expressions

Example 1

# -*- coding: utf-8 -*-
import requests
import re
import os

# Goal: scrape every image from the "糗图" section of qiushibaike.com
if __name__ == '__main__':
    # create a directory to store all the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    url = 'https://www.qiushibaike.com/imgrank/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # use a generic crawl to fetch the whole page the URL points to
    page_text = requests.get(url=url, headers=headers).text
    # use a focused crawl to extract every image on the page
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    # print(img_src_list)
    for src in img_src_list:
        # build the full image URL
        src = 'https:' + src
        # fetch the binary image data
        img_data = requests.get(url=src, headers=headers).content
        # build the image file name
        img_name = src.split('/')[-1]
        # final storage path of the image
        imgPath = './qiutuLibs/' + img_name
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded successfully!')

Example 2

# -*- coding: utf-8 -*-
import requests
import re
import os

# Goal: scrape every image from the "糗图" section of qiushibaike.com, page by page
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # create a directory to store all the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    # generic URL template
    url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    for pageNum in range(1, 3):
        # URL of the current page
        new_url = format(url % pageNum)
        page_text = requests.get(url=new_url, headers=headers).text
        # use a focused crawl to extract every image on the page
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        img_src_list = re.findall(ex, page_text, re.S)
        # print(img_src_list)
        for src in img_src_list:
            # build the full image URL
            src = 'https:' + src
            # fetch the binary image data
            img_data = requests.get(url=src, headers=headers).content
            # build the image file name
            img_name = src.split('/')[-1]
            # final storage path of the image
            imgPath = './qiutuLibs/' + img_name
            with open(imgPath, 'wb') as fp:
                fp.write(img_data)
            print(img_name, 'downloaded successfully!')

Using BeautifulSoup

Example 1

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # scrape the table-of-contents page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'utf-8'
    page_text = page_text.text
    # parse the chapter titles and detail-page URLs out of the contents page
    # 1. instantiate a BeautifulSoup object and load the page source into it
    soup = BeautifulSoup(page_text, 'lxml')
    # parse the chapter titles and the detail URLs
    li_list = soup.select('.book-mulu>ul>li')
    fp = open('./sanguo.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.a.string
        detail_url = 'https://www.shicimingju.com' + li.a['href']
        # request the detail page and parse out the chapter content
        detail_page_text = requests.get(url=detail_url, headers=headers)
        detail_page_text.encoding = 'utf-8'
        detail_page_text = detail_page_text.text
        # parse the chapter content from the detail page
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        # the chapter content
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, 'scraped successfully!')

Example 2

from bs4 import BeautifulSoup
import requests
import os

fileName = 'novel'
if not os.path.exists(fileName):
    os.mkdir(fileName)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.37',
    'Connection': 'close'
}
url = "https://www.shicimingju.com/book/sanguoyanyi.html"
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
title = soup.select('.book-mulu > ul > li > a')
cnt = 0
for t in title:
    href = 'https://www.shicimingju.com' + t['href']
    response = requests.get(url=href, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    div = soup.find('div', class_='card bookmark-list')
    filePath = fileName + '/' + t.string + '.txt'
    pageTxt = div.text
    with open(filePath, 'w', encoding='utf-8') as fp:
        fp.write(pageTxt)
    print('Scraped successfully!')
    cnt += 1
    # only download the first 10 chapters
    if cnt == 10:
        break

Selenium automated scraping

Handling the iframe problem

Example 1

# -*- coding: utf-8 -*-
# Goal: simulate logging in to QQ Zone
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://qzone.qq.com/')
# the login form lives inside an iframe, so switch into it first
bro.switch_to.frame('login_frame')
a_tag = bro.find_element_by_id("switcher_plogin")
a_tag.click()
userName_tag = bro.find_element_by_id('u')
password_tag = bro.find_element_by_id('p')
sleep(1)
userName_tag.send_keys('1292161328')
sleep(1)
password_tag.send_keys('1234567890')
sleep(1)
btn = bro.find_element_by_id('login_button')
btn.click()
sleep(3)
bro.quit()

Example 2

# -*- coding: utf-8 -*-
from selenium import webdriver
from lxml import etree
from time import sleep

# instantiate a browser object (pass in the driver); extra Chrome flags such as
# kiosk mode have to go through ChromeOptions, not the driver object itself
options = webdriver.ChromeOptions()
options.add_argument('--kiosk')
bro = webdriver.Chrome(executable_path='./chromedriver', options=options)
# have the browser request the given URL
bro.get('http://scxk.nmpa.gov.cn:81/xk/')
# get the page source of the browser's current page
page_text = bro.page_source
# parse the company names
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="gzlist"]/li')
for li in li_list:
    name = li.xpath('./dl/@title')[0]
    print(name)
sleep(5)
bro.quit()

Example 3

Other automated operations

from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('http://www.taobao.com/')
# locate an element
search_input = bro.find_element_by_id('q')
# interact with the element
search_input.send_keys('Iphone')
# execute a piece of JavaScript
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
# click the search button
btn = bro.find_element_by_css_selector('.btn-search')
btn.click()
bro.get('https://www.baidu.com')
sleep(2)
# go back
bro.back()
sleep(2)
# go forward
bro.forward()
sleep(5)
bro.quit()

Example 4

Headless mode

# -*- coding: utf-8 -*-
from selenium import webdriver
from time import sleep
# for running without a visible browser window
from selenium.webdriver.chrome.options import Options
# for evading detection
from selenium.webdriver import ChromeOptions

# headless setup
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# evade detection
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# pass both option objects to the driver
bro = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options, options=option)
# headless browsers: phantomJS is an older alternative
bro.get('https://www.baidu.com')
print(bro.page_source)
sleep(2)
bro.quit()

Example 5

from selenium import webdriver
from time import sleep

# the path of your browser driver goes here; prefix it with r'' to avoid escape issues
driver = webdriver.Chrome(r'./chromedriver')
# open the Baidu home page
driver.get("http://www.baidu.com")
# find the "设置" (settings) link on the page and click it
# driver.find_elements_by_link_text('设置')[0].click()
# sleep(2)
# # after opening the settings, find "搜索设置" (search settings) and show 50 results per page
# driver.find_elements_by_link_text('搜索设置')[0].click()
# sleep(2)
# # select 50 results per page
# m = driver.find_element_by_id('nr')
# sleep(2)
# m.find_element_by_xpath('//*[@id="nr"]/option[3]').click()
# m.find_element_by_xpath('.//option[3]').click()
# sleep(2)
# # click "save settings"
# driver.find_elements_by_class_name("prefpanelgo")[0].click()
# sleep(2)
# # handle the popup alert: accept() to confirm, dismiss() to cancel
# driver.switch_to_alert().accept()
# sleep(2)
# find the Baidu search box and type in the keyword 美女
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)
# click the search button
driver.find_element_by_id('su').click()
sleep(2)
# in the result page, find the link titled '美女_海量精选高清图片_百度图片' and open it
driver.find_elements_by_link_text('美女_海量精选高清图片_百度图片')[0].click()
sleep(3)
# close the browser
driver.quit()

Example 6

Handling iframe tags automatically

# -*- coding: utf-8 -*-
from selenium import webdriver
from time import sleep
# import the action-chain class
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# if the target element lives inside an iframe, you must switch into the iframe before locating it
bro.switch_to.frame('iframeResult')  # switch the browser's locating scope
div = bro.find_element_by_id('draggable')
# action chain
action = ActionChains(bro)
# click and hold the element
action.click_and_hold(div)
for i in range(5):
    # perform() executes the queued actions immediately
    # move_by_offset(x, y): x is horizontal, y is vertical
    action.move_by_offset(17, 0).perform()
    sleep(0.3)
# release the action chain
action.release()
print(div)

Example 7

12306 user login with Selenium

# -*- coding: utf-8 -*-
import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a wrongly recognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


# open the login page with Selenium
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.maximize_window()  # full screen
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
time.sleep(1)
# click "账号登录" (account login)
bro.find_elements_by_link_text('账号登录')[0].click()
time.sleep(1)
# save_screenshot captures the current page and saves it
bro.save_screenshot('aa.png')
# determine the top-left and bottom-right coordinates of the captcha image (the crop area)
code_img_ele = bro.find_element_by_css_selector('#J-loginImg')
location = code_img_ele.location  # top-left coordinates of the captcha image
print('location:', location)
size = code_img_ele.size  # width and height of the captcha element
print('size:', size)
# top-left and bottom-right coordinates
rangle = (
    int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
# the captcha image area is now determined
i = Image.open('./aa.png')
code_img_name = './code.png'
# crop cuts the screenshot down to the given area
frame = i.crop(rangle)
frame.save(code_img_name)
# send the captcha image to Chaojiying for recognition
chaojiying = Chaojiying_Client('1292161328', 'wuxiangnong', '915445')  # user center >> software ID
im = open('code.png', 'rb').read()  # path of the local image file
print(chaojiying.PostPic(im, 9004)['pic_str'])
result = chaojiying.PostPic(im, 9004)['pic_str']
all_list = []  # stores the coordinates of the points to click: [x1, y1], [x2, y2], ...
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)
# iterate over the list and click each (x, y) position with an action chain
for l in all_list:
    x = l[0]
    y = l[1]
    ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
    time.sleep(0.5)
bro.find_element_by_id('J-userName').send_keys('19828430139')
time.sleep(2)
bro.find_element_by_id('J-password').send_keys('wuxiangnong9595')
time.sleep(2)
bro.find_element_by_id('J-login').click()
time.sleep(3)
bro.quit()

Using proxies

Example 1

# -*- coding: utf-8 -*-
import requests

url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
}
# route the request through a proxy server
page_text = requests.get(url=url, headers=headers, proxies={"https": "222.110.147.50:3128"}).text
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

Captcha recognition

# -*- coding: utf-8 -*-
import requests
from lxml import etree
'''import the captcha-recognition class here (YDMHttp from the YunDaMa SDK)'''


# download the captcha image and send it to the recognition service
def getCodeText(imgPath, codeType):
    # regular-user username
    username = 'bobo328410948'
    # regular-user password
    password = 'bobo328410948'
    # software ID, required parameter for the developer revenue share; taken from the developer console ("My Software")
    appid = 6003
    # software key, required parameter for the developer revenue share; taken from the developer console ("My Software")
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'
    # image file: path of the captcha image to be recognized
    filename = imgPath
    # captcha type, e.g. 1004 = 4 alphanumeric characters; prices differ per type,
    # see http://www.yundama.com/price.html
    codetype = codeType
    # timeout in seconds
    timeout = 20
    result = None
    # sanity check
    if (username == 'username'):
        print('Please set the parameters before testing')
    else:
        # initialize
        yundama = YDMHttp(username, password, appid, appkey)
        # log in to YunDaMa
        uid = yundama.login()
        print('uid: %s' % uid)
        # query the balance
        balance = yundama.balance()
        print('balance: %s' % balance)
        # start recognition: image path, captcha type ID, timeout (seconds) -> result
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
    return result


if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
    }
    url = 'https://so.gushiwen.org/user/login.aspx'
    page_text = requests.get(url=url, headers=headers).text
    # parse the src attribute of the captcha <img>
    tree = etree.HTML(page_text)
    code_img_src = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    img_data = requests.get(url=code_img_src, headers=headers).content
    # the captcha image is saved locally
    with open('./code.jpg', 'wb') as fp:
        fp.write(img_data)
    # now hand it to the recognition service

Using coroutines

Example 1

# -*- coding: utf-8 -*-
import asyncio


async def request(url):
    print('requesting url:', url)
    print('request succeeded:', url)
    return url

# a function defined with async returns a coroutine object when it is called
c = request('www.baidu.com')

# creating an event loop directly:
# loop = asyncio.get_event_loop()
# # register the coroutine object with the loop, then start the loop
# loop.run_until_complete(c)

# using a task (kept commented out so the coroutine is only scheduled once, further down):
# loop = asyncio.get_event_loop()
# # create a task object from the coroutine, based on the loop
# task = loop.create_task(c)
# print(task)

# using a future:
# loop = asyncio.get_event_loop()
# task = asyncio.ensure_future(c)
# print(task)
# loop.run_until_complete(task)
# print(task)


def callback_func(task):
    # result() returns the return value of the function wrapped in the task object
    print(task.result())


# binding a callback
loop = asyncio.get_event_loop()
task = asyncio.ensure_future(c)
# bind the callback function to the task object
task.add_done_callback(callback_func)
loop.run_until_complete(task)

Synchronous crawler

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
}
urls = {
    'http://xmdx.sc.chinaz.net/Files/DownLoad/jianli/201904/jianli10231.rar',
    'http://zjlt.sc.chinaz.net/Files/DownLoad/jianli/201904/jianli10229.rar',
    'http://xmdx.sc.chinaz.net/Files/DownLoad/jianli/201904/jianli10231.rar'
}


def get_content(url):
    print('Scraping:', url)
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        return response.content


def parse_content(content):
    print('Length of the response data:', len(content))


for url in urls:
    content = get_content(url)
    parse_content(content)

Multithreaded and asynchronous crawling

Example 1

# -*- coding: utf-8 -*-
import asyncio
import time


async def request(url):
    print('downloading', url)
    # if synchronous, blocking code appears inside a coroutine, the whole thing stops being asynchronous
    # time.sleep(2)
    # blocking operations inside asyncio must be suspended manually with await
    await asyncio.sleep(2)
    print('download finished', url)

start = time.time()
urls = {
    'www.baidu.com',
    'www.sogou.com',
    'www.goubanjia.com'
}
# task list: holds the task objects
stasks = []
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    stasks.append(task)
loop = asyncio.get_event_loop()
# the task list has to be wrapped in wait()
loop.run_until_complete(asyncio.wait(stasks))
print(time.time() - start)

Example 2

# -*- coding: utf-8 -*-
import requests
import asyncio
import time

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom'
]


async def get_page(url):
    print('downloading', url)
    # requests.get is synchronous, so this coroutine gains nothing from asyncio;
    # the request itself must be made with an asynchronous module
    # aiohttp: a module for asynchronous network requests
    response = requests.get(url=url)
    print(response.text)

tasks = []
for url in urls:
    c = get_page(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print('elapsed', end - start)

Thread pools

Example 3

# -*- coding: utf-8 -*-
import time


# serial execution in a single thread
def get_page(str):
    print("downloading:", str)
    time.sleep(2)
    print('downloaded:', str)

name_list = ['xiaozi', 'aa', 'bb', 'cc']
start_time = time.time()
for i in range(len(name_list)):
    get_page(name_list[i])
end_time = time.time()
print('%d second' % (end_time - start_time))

Example 4

# -*- coding: utf-8 -*-
import time
# import the thread pool class
from multiprocessing.dummy import Pool

# execution with a thread pool
start_time = time.time()


def get_page(str):
    print("downloading:", str)
    time.sleep(2)
    print('downloaded:', str)

name_list = ['xiaozi', 'aa', 'bb', 'cc']
# instantiate a thread pool
pool = Pool(4)
# pass every element of the list to get_page
pool.map(get_page, name_list)
end_time = time.time()
print(end_time - start_time)

Asynchronous coroutines

Example 5

Asynchronous tasks with aiohttp

# -*- coding: utf-8 -*-
import time
import asyncio
import aiohttp

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom'
]


async def get_page(url):
    async with aiohttp.ClientSession() as session:
        # get() and post() accept:
        # headers, params/data, proxy='http://ip:port'
        async with session.get(url) as response:
            # text() returns the response body as a string
            # read() returns the response body as bytes
            # json() returns a JSON object
            # note: reading the response data must be awaited
            page_text = await response.text()
            print(page_text)

tasks = []
for url in urls:
    c = get_page(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print('elapsed', end - start)

Distributed crawling
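No code was written down for this section in the original notes. As a rough sketch of the idea: a distributed crawler shares one URL queue (and one set of already-seen URLs) between several machines, usually through Redis; scrapy-redis packages the same idea for Scrapy. The sketch below uses plain redis-py, and the queue and set names ('crawl:queue', 'crawl:seen') are made up for illustration.

# Minimal sketch of the shared-queue idea behind distributed crawling, assuming a
# reachable Redis server; the queue/set names are placeholders.
import redis
import requests

r = redis.Redis(host='127.0.0.1', port=6379)
headers = {'User-Agent': 'Mozilla/5.0'}


def worker():
    # every worker machine runs this same loop against the same Redis instance
    while True:
        item = r.brpop('crawl:queue', timeout=30)  # blocking pop of the next URL
        if item is None:
            break                                  # queue stayed empty, stop this worker
        url = item[1].decode('utf-8')
        if r.sadd('crawl:seen', url) == 0:
            continue                               # some worker already crawled this URL
        page_text = requests.get(url=url, headers=headers).text
        print(url, len(page_text))


# seed the queue from any one machine, e.g.:
# r.lpush('crawl:queue', 'https://example.com/page1')

Because the queue and the de-duplication set live in Redis rather than in one process, the work is split between the machines automatically.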

Simple practice projects

KFC store-query interface

# -*- coding: utf-8 -*-
import requests

if __name__ == "__main__":
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    param = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_data = response.text
    fp = open('./KFC.text', 'w', encoding='utf-8')
    fp.write(list_data)
    fp.close()
    print('over!!!')

Scraping résumé templates

import requests
import os
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    url = 'https://sc.chinaz.com/jianli/free.html'
    page_text = requests.get(url=url, headers=headers).text
    # create a directory
    if not os.path.exists('./new'):
        os.mkdir('./new')
    # instantiate the etree object
    tree = etree.HTML(page_text)
    a_lists = tree.xpath('//div[@id="container"]/div/a')
    for a in a_lists:
        href = a.xpath('./@href')[0]
        src = 'https:' + href
        page_text_detail = requests.get(url=src, headers=headers).text
        treeDetail = etree.HTML(page_text_detail)
        a_lists_products = treeDetail.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')[0]
        href2 = a_lists_products.xpath('./a/@href')[0]
        products_name = href2[-7:]
        response = requests.get(url=href2, headers=headers)
        data_products = response.content
        data_path = 'new/' + products_name
        with open(data_path, 'wb') as fp:
            fp.write(data_products)
        print(products_name, "downloaded successfully!")

Crawling with Baidu AI
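This section was also left empty in the original notes. A hedged sketch of the usual combination: download an image with requests and hand it to Baidu OCR through the baidu-aip SDK (pip install baidu-aip). The credential strings and the image URL below are placeholders; real values come from your own Baidu AI console, and the SDK call names are given from memory, so check them against the official docs.

# Minimal sketch, assuming the baidu-aip SDK; APP_ID / API_KEY / SECRET_KEY and the
# image URL are placeholders.
import requests
from aip import AipOcr

APP_ID = 'your_app_id'
API_KEY = 'your_api_key'
SECRET_KEY = 'your_secret_key'
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

headers = {
    'User-Agent': 'Mozilla/5.0'
}
# download an image that contains text
img_data = requests.get(url='https://example.com/some_image.jpg', headers=headers).content
# general OCR: the result dict has a 'words_result' list with the recognized text lines
result = client.basicGeneral(img_data)
for item in result.get('words_result', []):
    print(item['words'])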
