赞
踩
通过关键字爬取微博内容,包括用户名、微博正文、发布时间、来源、链接等
例如:(文本有些折叠了)
关键词:台风

代码如下:
"""Scrape Weibo search results for a keyword, one hour at a time.

For every hour in the configured date range the script opens the Weibo
search page restricted to that hour, extracts a few fields with XPath and
appends them as one row to the existing Excel workbook ``wb.xls``.
"""
import datetime
import time
from time import sleep
from urllib import parse

# selenium, lxml, xlrd and xlutils are imported inside the functions that use
# them, so the date/URL helpers below can be imported (and tested) on a
# machine that does not have the scraping stack installed.

keyword = '台风'  # keyword to search for
y = 2020  # start year
m = 3  # start month
d = 10  # start day
days = 20  # number of days to crawl
url_keyword = parse.quote(keyword)  # percent-encode the keyword for the URL

CHROMEDRIVER_PATH = r'D:\python\chorm\chromedriver.exe'
WORKBOOK_PATH = 'wb.xls'
SEARCH_URL_PREFIX = 'https://s.weibo.com/weibo?q='
SEARCH_URL_OPTIONS = '&typeall=1&suball=1&timescope=custom:'
PAGE_LOAD_WAIT = 2  # seconds to wait for the page to finish rendering

# (sheet column, label, XPath) for every value stored per row.
# NOTE(review): the expressions index different children of the result list
# (div[1], div[2], div[5]), so the fields of one row may come from different
# posts -- confirm against the live page before relying on row consistency.
FIELD_XPATHS = (
    (1, 'name',
     ".//*[@id='pl_feedlist_index']/div[2]/div[2]/div/div[1]/div[2]/div[1]/div[2]/a[1]/text()"),
    (2, 'href',
     ".//*[@id='pl_feedlist_index']/div[2]/div[1]/div/div[1]/div[2]/p[2]/a[1]/@href"),
    (3, 'text',
     ".//*[@id='pl_feedlist_index']/div[2]/div[2]/div/div[1]/div[2]/p[1]//text() "),
    (4, 'time',
     ".//*[@id='pl_feedlist_index']/div[2]/div[1]/div/div[1]/div[2]/p[2]/a[1]/text()"),
    (5, 'from',
     ".//*[@id='pl_feedlist_index']/div[2]/div[5]/div/div[1]/div[2]/p[3]/a[2]/text()"),
)


def getday(y, m, d, n):
    """Return the date ``n`` days after ``y-m-d`` formatted as YYYY-MM-DD.

    ``n`` may be negative to step backwards.
    """
    start = datetime.datetime(y, m, d)
    return (start + datetime.timedelta(days=n)).strftime('%Y-%m-%d')


def search_url(day_offset, hour):
    """Build the search URL for one hour of the crawl.

    Args:
        day_offset: Number of days after the configured start date.
        hour: Hour of that day, 0-23.

    Returns:
        The search URL whose custom time scope covers ``hour`` to
        ``hour + 1`` of the selected day.
    """
    start_day = getday(y, m, d, day_offset)
    if hour == 23:
        # The last hour of a day ends at hour 0 of the *following* day.
        end_day, end_hour = getday(y, m, d, day_offset + 1), 0
    else:
        end_day, end_hour = start_day, hour + 1
    scope = '{}-{}:{}-{}'.format(start_day, hour, end_day, end_hour)
    return SEARCH_URL_PREFIX + url_keyword + SEARCH_URL_OPTIONS + scope


def _clean_text(fragments):
    """Join XPath text fragments and collapse all runs of whitespace."""
    return ' '.join(''.join(fragments).split())


def _fetch_page(driver, url):
    """Load ``url`` in ``driver`` and return the rendered page source."""
    driver.get(url)
    sleep(PAGE_LOAD_WAIT)  # give the page time to finish loading
    page_text = driver.page_source
    sleep(PAGE_LOAD_WAIT)  # keep the original pause before the next request
    return page_text


def _extract_fields(page_text):
    """Return ``[(column, label, value), ...]`` extracted from the page.

    Every value is a cleaned string; it is empty when the XPath matched
    nothing. An empty page yields an empty list.
    """
    from lxml import etree

    if not page_text:
        return []
    tree = etree.HTML(page_text)
    if tree is None:
        return []
    return [(column, label, _clean_text(tree.xpath(xpath)))
            for column, label, xpath in FIELD_XPATHS]


def _append_row(row, fields):
    """Write ``fields`` into ``row`` of the first sheet and save the workbook.

    The workbook at ``WORKBOOK_PATH`` must already exist; it is re-read and
    saved for every row so progress survives an interrupted crawl.
    """
    import xlrd
    from xlutils.copy import copy

    writable = copy(xlrd.open_workbook(WORKBOOK_PATH))
    sheet = writable.get_sheet(0)
    for column, label, value in fields:
        sheet.write(row, column, value)
        print(label, value)
    writable.save(WORKBOOK_PATH)


def p(days, x):
    """Crawl ``days`` days hour by hour and store one row per hour.

    Args:
        days: Number of consecutive days to crawl from the start date.
        x: Index of the first sheet row to write; it advances by one for
            every hour that produced data. Hours with no data are skipped
            and do not consume a row.
    """
    from selenium import webdriver

    # One browser for the whole crawl; always closed, even on errors.
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    try:
        for day_offset in range(days):
            for hour in range(24):
                url = search_url(day_offset, hour)
                print(url)
                fields = _extract_fields(_fetch_page(driver, url))
                if not any(value for _, _, value in fields):
                    print('no data extracted for this hour, skipping')
                    continue
                _append_row(x, fields)
                x += 1
                print(x)
    finally:
        driver.quit()


if __name__ == '__main__':
    p(days, 1)
有几个问题还没完善
使用 selenium 太慢了(可以考虑用多线程同时爬取)
获取的文本和时间有多余的空格(可以用正则表达式去除)
某个时间段可能没有微博,会爬到空结果(需要添加一个判断)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。