Target site: https://ydl.com/experts/
Goal: scrape the IDs and names of all 205 counselors.
Analysis: the names need no explanation, so let me focus on the IDs. Open the developer tools and inspect any counselor to locate their tag in the HTML. We only need to parse out the href attribute and extract the digits it contains.
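To make that extraction step concrete: once bs4 hands us the anchor tag, pulling the ID is a one-liner. A minimal sketch (the href value below is made up for illustration; the real format may differ):

href = '/experts/12345'
doctor_id = ''.join(ch for ch in href if ch.isdigit())
print(doctor_id)  # -> '12345'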
Since collecting the counselor IDs and names does not require clicking into each counselor's detail page, we don't need webdriver here; parsing the page with bs4 is faster and the code is simpler to write. The complete code is as follows:
#!/usr/bin/env python
# encoding=utf-8
"""
Author: YJY
Purpose: scrape the counselor list from the YiDianLing site
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

DOWNLOAD_URL = 'https://ydl.com/experts/'


def download_page(url):
    return requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }).content


def parse_html(html, doctor_information_list):
    # print("parse_html called once")
    try:
        soup = BeautifulSoup(html, 'html.parser')
        doctor_list_soup = soup.find('div', attrs={'class': 'expertsList_items'})
        # find_all returns every counselor on this page; each counselor's
        # full record sits inside an 'item' div.
        for doctor_item in doctor_list_soup.find_all('div', attrs={'class': 'item'}):
            doctor_info = doctor_item.find('div', attrs={'class': 'info'})
            doctor_information = doctor_info.find('h3').find('a')
            # Name
            doctor_name = doctor_information.getText()
            # ID: keep only '#' and the digits from the href
            doctor_id = doctor_information['href']
            doctor_id = ''.join(filter(lambda i: i in ['#'] or i.isdigit(), doctor_id))
            # Append ID and name to the counselor list
            doctor_information_list.append([doctor_id, doctor_name.replace('\n', '').replace(' ', '')])
        # Check whether a next page exists
        next_page = soup.find('li', attrs={'class': 'next'}).find('a')
        print(next_page['href'])
        if next_page:
            # Next page exists: return the current list and the next page's URL
            return doctor_information_list, DOWNLOAD_URL + next_page['href']
        # No next page: return the list with url=None to end the while loop
        return doctor_information_list, None
    # Crude error handling: reuse next_page if it was found before the error;
    # otherwise the NameError propagates up to main()'s handler
    except Exception as e:
        return doctor_information_list, DOWNLOAD_URL + next_page['href']


def main():
    url = DOWNLOAD_URL
    page = 0
    doctors = []
    try:
        while url:
            # Print the page currently being scraped (the first two fetches both
            # returned page 1; I did not chase this bug down, but it is harmless)
            print("Scraping page %d" % page)
            page += 1
            try:
                html = download_page(url)
                doctors, url = parse_html(html, doctors)
            except Exception as e:
                # Page 102 raises an exception (cause not investigated),
                # so set the next page's URL manually
                url = DOWNLOAD_URL + '/experts?experts=&page=' + str(page)
    # Crude exception handling
    except Exception as e:
        print(e)
    # Convert the list to a DataFrame and export to csv
    name = ['DocID', 'DocName']
    doctor_information = pd.DataFrame(columns=name, data=doctors)
    doctor_information.to_csv('data/doctor_information.csv', encoding='utf-8', index=None)


if __name__ == '__main__':
    main()
With that we have the ID and name of every counselor, ready for the data collection that follows. The code was written in a hurry and leaves plenty of room for improvement; it is rough because the goal was just to get it working, and the exception handling in particular is careless, so please don't scrutinize it too closely.
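Before moving on, it is worth sanity-checking the output. A quick sketch that reads the exported file back (assuming the script above has run and written data/doctor_information.csv):

import pandas as pd

df = pd.read_csv('data/doctor_information.csv')
print(len(df))    # should be 205 if every counselor was captured
print(df.head())  # columns: DocID, DocName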
Now, on to the code for the second part. A note up front: readers unfamiliar with XPath, Selenium, and webdriver should work through a tutorial on them first.
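As a warm-up, here is a minimal Selenium + XPath round trip; the chromedriver path follows the convention used later in this post, and the '//h3/a' XPath is an illustrative assumption rather than a selector taken from the site:

from selenium import webdriver

driver = webdriver.Chrome(r"chromedriver.exe")
driver.get("https://ydl.com/experts/")
# XPath: grab the first link nested inside any h3 heading
first_link = driver.find_element_by_xpath('//h3/a')
print(first_link.text, first_link.get_attribute('href'))
driver.quit()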
First, import the required packages:
import pandas as pd
import time
from selenium import webdriver
Define the YiDianLing class, with five attributes and two methods.
class YiDianLing(object):
    def __init__(self):
        self.url = "https://ydl.com/experts/"
        # Create the main browser
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        # Create a child browser: navigating back with webdriver can lose data,
        # so we open each counselor's Q&A detail in the child browser while the
        # main browser stays on the Q&A list. There are many pitfalls here, so
        # be careful!
        self.driver_son = webdriver.Chrome(r"chromedriver.exe")
        # Read the counselor IDs and names
        self.DoctorIdlist = pd.read_csv("data/doctor_information.csv")['DocID']
        self.DoctorNamelist = pd.read_csv("data/doctor_information.csv")['DocName']

    # Scrape the Q&A data
    def AskDataCrawl(self):
        ...

    # Scrape the article data (no commentary on this one; the code is at the end)
    def ArticleDataCrawl(self):
        ...
Create the Q&A data list (at the end this list is converted to a DataFrame and exported to csv). Then iterate over the counselor file scraped earlier, visiting each counselor's home page.
# Create the Q&A data list
AskData = []
# Iterate over the counselor list
for i in range(len(self.DoctorIdlist)):
# Visit the (i+1)-th counselor's home page
doctorurl = self.url + str(self.DoctorIdlist[i])
self.driver.get(doctorurl)
time.sleep(1)
Click the "查看全部" (view all) link on the page to jump to the Q&A page.
# Jump to the Q&A section (the '查看全部' link)
self.driver.find_element_by_link_text('查看全部').click()
# Close the old window
self.driver.switch_to.window(self.driver.window_handles[0])
self.driver.close()
# Make the newest window the current one
self.driver.switch_to.window(self.driver.window_handles[-1])
time.sleep(1)
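This window juggling is the trickiest part of the loop: clicking '查看全部' opens a new tab, so we close the original handle and then focus the newest one. Wrapped as a reusable helper, the pattern looks like this sketch (the helper name is mine, not from the original code):

def switch_to_new_window(driver):
    # Close the original window, then focus the most recently opened one
    driver.switch_to.window(driver.window_handles[0])
    driver.close()
    driver.switch_to.window(driver.window_handles[-1])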
Iterate over all the Q&A entries on the current page:
# Get all Q&A entries on the current page
questionsList = self.driver.find_elements_by_class_name('item')
for question in questionsList:
    time.sleep(3)
    try:
        # Question URL
        QuestionUrl = question.find_element_by_tag_name('a').get_attribute('href')
        print(QuestionUrl)
        # Question ID, plus an answer ID formed from question ID + counselor ID
        QuestionID = QuestionUrl.split('/')[4]
        AnswerID = QuestionID + str(self.DoctorIdlist[i])
        # Open the question in the child browser
        self.driver_son.get(QuestionUrl)
        time.sleep(1)
        # Question text
        QuestionText = self.driver_son.find_element_by_xpath('//*[@class="content"]/p').text
        QuestionText = QuestionText.replace('\n', '')
        # Date the question was asked
        QuestionDate = self.driver_son.find_element_by_class_name('ask_right').text.split('\n')
        QuestionDate = QuestionDate[0]
        # Date of the counselor's reply
        AnswerDate = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/time').text
        # Text of the counselor's reply
        AnswerText = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/p').text
        AnswerText = AnswerText.replace('\n', '')
        # Number of thanks the reply received
        AnswerThanks = self.driver_son.find_element_by_xpath('//*[@class="votable"]/a/font').text
        # Append to the list
        AskData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], QuestionID, QuestionText,
                        QuestionDate, AnswerID, AnswerDate, AnswerText, AnswerThanks])
    except Exception as e:
        continue
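To see why split('/')[4] yields the question ID, walk through a URL of the assumed form (the exact path layout is an assumption inferred from the index used above):

QuestionUrl = 'https://ydl.com/ask/678910'   # hypothetical question URL
print(QuestionUrl.split('/'))     # ['https:', '', 'ydl.com', 'ask', '678910']
print(QuestionUrl.split('/')[4])  # '678910'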
Jump to the next page; when no further page can be reached, exit the loop, move on to the next counselor's home page, and repeat. (In the full code this sits inside a while True loop, with page initialized to 1.)
# Jump to the next page
try:
page += 1
self.driver.get('https://ydl.com/experts/' + str(self.DoctorIdlist[i]) + '/answerList/p' + str(page))
time.sleep(3)
except Exception as e:
break
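One caveat: driver.get seldom raises even when the page number runs past the last page, so the except/break above may rarely fire. A more robust stop condition (a sketch, not in the original code) is to break when a page yields no question items:

# Sketch: stop paging once a page returns no question items
questionsList = self.driver.find_elements_by_class_name('item')
if not questionsList:
    break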
The complete code follows, covering both the Q&A data and the article data. It was written in a hurry, so there are inevitably some small issues; please bear with me!
#!/usr/bin/env python
# encoding=utf-8
"""
Author: YJY
Purpose: scrape counselor Q&A data and article data from the YiDianLing site
"""
import pandas as pd
import time
from selenium import webdriver


class YiDianLing(object):
    def __init__(self):
        self.url = "https://ydl.com/experts/"
        # Create the main browser and the child browser
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        self.driver_son = webdriver.Chrome(r"chromedriver.exe")
        # Read the counselor IDs and names
        self.DoctorIdlist = pd.read_csv("data/doctor_information.csv")['DocID']
        self.DoctorNamelist = pd.read_csv("data/doctor_information.csv")['DocName']

    def AskDataCrawl(self):
        # Create the Q&A data list
        AskData = []
        # Iterate over the counselor list
        nums = 1
        for i in range(len(self.DoctorIdlist)):
            # Visit the i-th counselor's home page
            doctorurl = self.url + str(self.DoctorIdlist[i])
            self.driver.get(doctorurl)
            time.sleep(1)
            # Jump to the Q&A section (the '查看全部' link)
            self.driver.find_element_by_link_text('查看全部').click()
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[-1])
            time.sleep(1)
            # Page number within the question list
            page = 1
            while True:
                # Get all Q&A entries
                questionsList = self.driver.find_elements_by_class_name('item')
                for question in questionsList:
                    time.sleep(3)
                    try:
                        # Question URL
                        QuestionUrl = question.find_element_by_tag_name('a').get_attribute('href')
                        print(QuestionUrl)
                        # Question ID, plus an answer ID formed from question ID + counselor ID
                        QuestionID = QuestionUrl.split('/')[4]
                        AnswerID = QuestionID + str(self.DoctorIdlist[i])
                        # Open the question in the child browser
                        self.driver_son.get(QuestionUrl)
                        time.sleep(1)
                        # Question text
                        QuestionText = self.driver_son.find_element_by_xpath('//*[@class="content"]/p').text
                        QuestionText = QuestionText.replace('\n', '')
                        # Date the question was asked
                        QuestionDate = self.driver_son.find_element_by_class_name('ask_right').text.split('\n')
                        QuestionDate = QuestionDate[0]
                        # Date of the counselor's reply
                        AnswerDate = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/time').text
                        # Text of the counselor's reply
                        AnswerText = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/p').text
                        AnswerText = AnswerText.replace('\n', '')
                        # Number of thanks the reply received
                        AnswerThanks = self.driver_son.find_element_by_xpath('//*[@class="votable"]/a/font').text
                        # Append to the list
                        AskData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], QuestionID, QuestionText,
                                        QuestionDate, AnswerID, AnswerDate, AnswerText, AnswerThanks])
                        print(AskData)
                        # self.driver_son.close()
                    except Exception as e:
                        continue
                # Jump to the next page
                try:
                    page += 1
                    self.driver.get('https://ydl.com/experts/' + str(self.DoctorIdlist[i]) + '/answerList/p' + str(page))
                    time.sleep(3)
                except Exception as e:
                    break
            nums += 1
        # Export to csv
        Name = ['DocID', 'DocName', 'QuestionID', 'QuestionText', 'QuestionDate',
                'AnswerID', 'AnswerDate', 'AnswerText', 'AnswerThanks']
        df = pd.DataFrame(columns=Name, data=AskData)
        df.to_csv('data/doctor_ask.csv')

    def ArticleDataCrawl(self):
        # Create the article data list
        ArticleData = []
        # Iterate over the counselor list
        for i in range(len(self.DoctorIdlist)):
            # Visit the counselor's home page
            doctorurl = self.url + str(self.DoctorIdlist[i])
            self.driver.get(doctorurl)
            time.sleep(1)
            # Check whether the counselor has articles; if so, open that tab
            try:
                if '文章' in self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[5]').text:
                    self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[5]').click()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(1)
                elif '文章' in self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[6]').text:
                    self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[6]').click()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(1)
                else:
                    continue
            except Exception as e:
                continue
            # Page number within the article list
            page = 1
            err_appear = False
            while not err_appear:
                articlesList = self.driver.find_elements_by_xpath('//*[@class="ui-content testings index testings_index chrome modern mac webkit desktop"]/div')
                for article in articlesList:
                    time.sleep(3)
                    try:
                        # Article URL
                        ArticleUrl = article.find_element_by_tag_name('a').get_attribute('href')
                        print(ArticleUrl)
                    except:
                        err_appear = True
                        break
                    # Article ID
                    ArticleID = ArticleUrl.split('/')[4]
                    # Open the article in the child browser
                    self.driver_son.get(ArticleUrl)
                    time.sleep(1)
                    try:
                        # Publication date, view count, and like count
                        ArticleInfo = self.driver_son.find_element_by_xpath('//*[@class="post_desc"]').text
                        ArticleDate = ArticleInfo.split(' ')[1]
                        ArticleViews = ArticleInfo.split(' ')[2]
                        ArticleViews = ''.join(filter(str.isdigit, ArticleViews))
                        ArticleVotes = ArticleInfo.split(' ')[3]
                        ArticleVotes = ''.join(filter(str.isdigit, ArticleVotes))
                        # Article text
                        ArticleText = self.driver_son.find_element_by_class_name('event_content').text
                        ArticleText = ArticleText.replace('\n', '')
                        # Number of images in the article
                        ArticleImageNum = len(self.driver_son.find_elements_by_xpath('//*[@class="event_content"]//img'))
                        # Append to the list
                        ArticleData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], ArticleID, ArticleDate,
                                            ArticleImageNum, ArticleText, ArticleViews, ArticleVotes])
                        print(ArticleData)
                    except Exception as e:
                        continue
                # Jump to the next page
                try:
                    page += 1
                    self.driver.get('https://www.ydl.com/experts/' + str(self.DoctorIdlist[i]) + '/jingyan/p' + str(page))
                    time.sleep(3)
                except Exception as e:
                    break
        # Export to csv
        Name = ['DocID', 'DocName', 'ArticleID', 'ArticleDate', 'ArticleImageNum',
                'ArticleText', 'ArticleViews', 'ArticleVotes']
        df = pd.DataFrame(columns=Name, data=ArticleData)
        df.to_csv('data/doctor_article.csv')


if __name__ == '__main__':
    yidianling = YiDianLing()
    yidianling.AskDataCrawl()
    yidianling.ArticleDataCrawl()
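Once both crawls finish, the two CSV files share the DocID column and can be joined for analysis. A minimal sketch of a per-counselor summary (not part of the original pipeline):

import pandas as pd

asks = pd.read_csv('data/doctor_ask.csv')
articles = pd.read_csv('data/doctor_article.csv')
# Count answers and articles per counselor and line them up on DocID
summary = (asks.groupby('DocID').size().rename('NumAnswers').to_frame()
           .join(articles.groupby('DocID').size().rename('NumArticles'), how='outer'))
print(summary.head())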