
A Python crawler experiment on page views: a flop

In an earlier crawler exercise I set the User-Agent request header so that the site would believe the visit came from a real browser. That made me wonder whether the same trick could inflate the view count of my CSDN blog, so I used this post as a test subject.

For the first attempt I hit the post with plain requests.get calls, using no proxy IPs and no concurrency. It worked, but the view count climbed slowly, and a single IP is easy for the site's anti-crawler system to spot and block.

    import requests
    import time
    import random

    # the post whose view count we want to bump
    url = 'https://blog.csdn.net/qq_36171287/article/details/91352388'

    # pool of User-Agent strings to rotate through
    user_agent_list = [
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]

    # send 200 requests, each pretending to come from a different browser
    i = 0
    while i < 200:
        header = {'User-Agent': random.choice(user_agent_list)}
        try:
            response = requests.get(url, headers=header)
            print("success")
        except requests.RequestException:
            print("mistake")
        time.sleep(30)
        i = i + 1
        print("**********")
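One caveat with the loop above: the try/except only catches network failures, so a request that the server rejects with HTTP 403 still prints "success". A minimal sketch of a stricter check (raise_for_status() is part of the standard requests API):

    try:
        response = requests.get(url, headers=header, timeout=10)
        response.raise_for_status()  # raise on 4xx/5xx so a blocked request is not counted as success
        print("success", response.status_code)
    except requests.RequestException as e:
        print("mistake", e)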

For the second attempt I routed the requests through proxy IPs and ran several threads at once, which raised the throughput considerably:

Rotate through proxy IPs

Run 8 threads in parallel

    import requests
    import time
    import random
    from threading import Thread
    from bs4 import BeautifulSoup

    # the post whose view count we want to bump
    url = 'https://blog.csdn.net/qq_36171287/article/details/91352388'

    # pool of User-Agent strings to rotate through
    user_agent_list = [
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]

    def get_ip_list(proxy_url, headers):
        # scrape a table of free proxies and return them as "ip:port" strings
        web_data = requests.get(proxy_url, headers=headers)
        soup = BeautifulSoup(web_data.text, 'lxml')
        ips = soup.find_all('tr')
        ip_list = []
        for i in range(1, len(ips)):
            tds = ips[i].find_all('td')
            ip_list.append(tds[1].text + ':' + tds[2].text)
        return ip_list

    def get_random_ip(ip_list):
        # pick one proxy at random, in the dict format requests expects
        proxy_ip = 'http://' + random.choice(ip_list)
        return {'http': proxy_ip}

    def fun(url, proxies):
        # fetch the target page once with a random User-Agent and proxy
        header = {'User-Agent': random.choice(user_agent_list)}
        try:
            response = requests.get(url, headers=header, proxies=proxies)
            print("success")
        except requests.RequestException:
            print("mistake")
        time.sleep(30)

    def run():
        """Start 8 threads, each hitting the target URL through a random proxy."""
        threads = []
        # number of concurrent threads; adjust as needed
        for i in range(8):
            proxies = get_random_ip(ip_list)
            print(proxies)
            threads.append(Thread(target=fun, args=(url, proxies)))
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    if __name__ == '__main__':
        proxy_url = 'http://www.xicidaili.com/nn/'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }
        # scrape the proxy list once, then request the post forever
        ip_list = get_ip_list(proxy_url, headers=headers)
        while True:
            run()
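Free proxies scraped this way are often dead or very slow, and every dead proxy wastes a 30-second thread slot. A small filter worth running right after get_ip_list(), sketched here with http://httpbin.org/ip as an assumed test endpoint:

    def check_proxy(ip, timeout=5):
        # return True only if the proxy can complete a real request in time
        try:
            requests.get('http://httpbin.org/ip',
                         proxies={'http': 'http://' + ip},
                         timeout=timeout)
            return True
        except requests.RequestException:
            return False

    # keep only proxies that actually respond
    ip_list = [ip for ip in ip_list if check_proxy(ip)]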

When I tested again later, though, the count no longer increased: the page had switched to serving its content through encrypted JavaScript.

Crawling it with the code below no longer returns the article; what comes back is the obfuscated JS:

    import requests

    # the post to fetch
    url = 'https://blog.csdn.net/qq_36171287/article/details/91352388'
    # request header telling the server this is a browser
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # send the HTTP request pretending to be a browser
    response = requests.get(url, headers=header)
    print(response.text)
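Because the content is now assembled by JavaScript, plain requests.get can never see the rendered page. One possible way forward (untested here, and with no guarantee it registers a view) is to drive a real browser instead. A minimal sketch assuming selenium and a matching ChromeDriver are installed:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument('--headless')  # run Chrome without opening a window
    driver = webdriver.Chrome(options=options)
    driver.get('https://blog.csdn.net/qq_36171287/article/details/91352388')
    print(driver.page_source)  # HTML after the page's JavaScript has executed
    driver.quit()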
