We use requests to fetch the pages, lxml's etree to parse out the data, time to throttle the requests, and openpyxl to save the results to an Excel file.
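The basic request-and-parse pattern looks like this (a minimal sketch; the URL and XPath are the same ones used in the full script below, and the User-Agent here is just a placeholder):

import requests
from lxml import etree

# Fetch one page of the CSDN forum list and print the topic names
url = 'https://bbs.csdn.net/tech_hot_topics?page=1'
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = etree.HTML(response.content.decode('utf-8'))
names = html.xpath('//div[@class = "list_1"]/ul/li/a/text()')
print(names)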
We can enter the number of pages to crawl.
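The pagination itself just builds one listing URL per page from the entered number, roughly like this (a minimal sketch; the actual loop lives in start() in the full source below):

page = int(input('Number of pages to crawl: '))
for p in range(1, page + 1):
    # Build the URL of each listing page from its page number
    url = 'https://bbs.csdn.net/tech_hot_topics?page=' + str(p)
    print(url)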
Running the code prints a banner for each page and the name of every forum topic as it is scraped.
The scraped data is stored in an Excel file, CSDN.xlsx.
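Writing the collected name-to-URL pairs with openpyxl boils down to something like this (a minimal sketch using sheet.append rather than the cell-by-cell assignment in the full script; the sample row is made up):

import openpyxl

data = {'Python': 'https://bbs.csdn.net/forums/Python'}  # hypothetical sample row

wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = '论坛'
sheet.append(['序号', '名称', '请求网址'])  # header row: index, name, request URL
for i, (name, link) in enumerate(data.items(), start=1):
    sheet.append([i, name, link])
wb.save('CSDN.xlsx')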
The full source code is as follows:
# _*_ coding: utf-8 _*_
# Email: 3195841740@qq.com
# Author: 21292
# Date: 2020/3/10 10:42
# Tool: PyCharm
import requests
from lxml import etree
import time
import openpyxl

headers = {
    'Host': 'bbs.csdn.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    # Session cookie copied from the author's own logged-in browser; replace it with your own if you run this
    'Cookie': 'uuid_tt_dd=10_18634657010-1580480278567-205683; dc_session_id=10_1580480278567.959197; __gads=ID=442022e467108b24:T=1580480282:S=ALNI_Ma1eS1wB7Jxj3O7hnYAWcbLta-ROg; UserName=DHKSHFJ; UserInfo=00291c5cf64747cc8c74b36657573e33; UserToken=00291c5cf64747cc8c74b36657573e33; UserNick=%E8%BF%81%E5%B0%B10423; AU=EE3; UN=DHKSHFJ; BT=1580538639376; p_uid=U000000; searchHistoryArray=%255B%2522%25E8%25B1%2586%25E7%2593%25A3%25E7%2588%25AC%25E8%2599%25AB%2522%252C%2522%25E7%2588%25AC%25E8%2599%25AB%2522%252C%2522scrapy%25E7%2588%25AC%25E5%258F%2596%25E5%2588%25B0%25E7%259A%2584%25E6%2595%25B0%25E6%258D%25AE%25E5%2586%2599%25E5%2585%25A5%25E6%2596%2587%25E4%25BB%25B6%2522%252C%2522requests%2522%252C%2522tkinter%2522%255D; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1583729124,1583729168,1583738250,1583738329; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=5744*1*DHKSHFJ!6525*1*10_18634657010-1580480278567-205683; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1583762044; c_ref=https%3A//www.baidu.com/link%3Furl%3DYRI-WAKUg0fJa8HulbdWQC291VDpdN-rTAvCbTu45gnxD4WF6iz2JUM7jAHAaRsr-vT8h3ulBiZWga31NFtQrq%26wd%3D%26eqid%3D975e2ab900002c82000000055e65edd2; dc_tos=q6xhxh; announcement=%257B%2522isLogin%2522%253Atrue%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblog.csdn.net%252Fblogdevteam%252Farticle%252Fdetails%252F103603408%2522%252C%2522announcementCount%2522%253A0%252C%2522announcementExpire%2522%253A3600000%257D; _csdn_newbbs_session=BAh7CEkiD3Nlc3Npb25faWQGOgZFRkkiJTRlNjc5YzY2YzQ4ODAwNzZlZDg1YmRhMjc5ZDJiYTY1BjsAVEkiDHVzZXJfaWQGOwBGaQSZ%2FacESSIQX2NzcmZfdG9rZW4GOwBGSSIxSWtITHZFc2hDSmdpbXdzTlU4QytFL2RtWiswNkhrTGd6WjhTVUpXN01NWT0GOwBG--44720c2c33cce034ee7b1f2d2f3f04b0a5688ff3; TY_SESSION_ID=d958c2e9-b7dd-4cb7-8264-d68d2605cb95',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'TE': 'Trailers'
}

# An empty dict that stores the scraped name -> URL pairs
blog_detials = {}


# Scrape the forum topic names and their corresponding URLs
def get_name(url):
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    # Topic names
    names = html.xpath('//div[@class = "list_1"]/ul/li/a/text()')
    # Relative URLs
    urls = html.xpath('//div[@class = "list_1"]/ul/li/a/@href')
    # Pair each name with its full URL
    for each in range(len(names)):
        print(names[each])
        # Store the pair in the dict
        blog_detials[names[each]] = 'https://bbs.csdn.net' + urls[each]
    # Save everything collected so far to Excel
    save_openpyxl(blog_detials)
    # Throttle
    time.sleep(0.005)


# Save function; line holds the current Excel row number
def save_openpyxl(blog_detials, line=[2]):
    file = openpyxl.Workbook()
    sheet = file.active
    sheet.title = '论坛'
    sheet['A1'] = '序号'
    sheet['B1'] = '名称'
    sheet['C1'] = '请求网址'
    line[0] = 2
    for each in blog_detials:
        # Fill the cells of this row
        sheet['A' + str(line[0])] = line[0] - 1
        sheet['B' + str(line[0])] = each
        sheet['C' + str(line[0])] = blog_detials[each]
        line[0] = line[0] + 1
    file.save('CSDN.xlsx')


# Run the crawl over the requested number of pages
def start(page):
    for page in range(1, page + 1, 1):
        # URL of each listing page
        url = 'https://bbs.csdn.net/tech_hot_topics?page=' + str(page)
        print('*' * 30 + '正在爬取第' + str(page) + '页' + '*' * 30)
        get_name(url)
        time.sleep(0.005)


if __name__ == '__main__':
    # The forum has 100 pages in total
    page = int(input("CSDN共100页,请输入你要爬取页面个数:"))
    # Start crawling
    start(page)
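Note that save_openpyxl creates a fresh workbook and rewrites CSDN.xlsx on every page; the final file is still complete because blog_detials keeps accumulating, but the file is written once per page. A variant that collects everything first and saves only once at the end could look like this (a sketch only, reusing headers and save_openpyxl from the script above):

def crawl_and_save(pages):
    # Collect all name -> URL pairs across pages, then write the workbook once
    results = {}
    for p in range(1, pages + 1):
        url = 'https://bbs.csdn.net/tech_hot_topics?page=' + str(p)
        text = requests.get(url, headers=headers).content.decode('utf-8')
        html = etree.HTML(text)
        names = html.xpath('//div[@class = "list_1"]/ul/li/a/text()')
        links = html.xpath('//div[@class = "list_1"]/ul/li/a/@href')
        for name, link in zip(names, links):
            results[name] = 'https://bbs.csdn.net' + link
        time.sleep(0.005)
    save_openpyxl(results)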
Douban: https://blog.csdn.net/DHKSHFJ/article/details/104739831
Movie Paradise (dytt): https://blog.csdn.net/DHKSHFJ/article/details/104740106