赞
踩
目标是爬取今天的酒店信息,并将这些信息存成文本文件。
功能分解:
搜索功能,在搜索框输入地点和入住时间,点击搜索按钮
获取一页完整的数据,由于去哪网一个页面数据分为两次加载,第一次加载数据,这时候需要将页面拉到底部,完成第二次数据加载。
获取一页完整且渲染过的HTML文档后,使用BeautifulSoup将其中的酒店信息提取出来进行存储
解析完成,点击下一页,继续抽取数据
import datetime
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Base URL prepended to the relative hotel links found on result pages.
url = 'xxxx'


class QunaSpider(object):
    """Crawl qunar.com hotel listings for one city and date range.

    Each result page's hotel links and titles are appended to a text file
    named '<city><checkin-date>.html'.
    """

    def get_hotel(self, driver, to_city, fromdate, todate):
        """Fill in the search form, then walk every result page.

        The site loads each result page in two stages, so after every
        navigation the window is scrolled to the bottom to trigger the
        second (lazy) load before the HTML is parsed.

        driver   -- an open selenium WebDriver already on the search page
        to_city  -- destination city name
        fromdate -- check-in date, 'YYYY-MM-DD'
        todate   -- check-out date, 'YYYY-MM-DD'
        """
        # NOTE(review): absolute XPaths are brittle -- they break whenever
        # the page layout changes; verify against the live page before use.
        ele_toCity = driver.find_element(
            by=By.XPATH,
            value='/html/body/div[3]/div/section/section[1]/div[1]/div[2]/div/div/input')
        ele_fromDate = driver.find_element(
            by=By.XPATH,
            value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[1]/div/input')
        ele_toDate = driver.find_element(
            by=By.XPATH,
            value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[2]/div/input')
        ele_search = driver.find_element(by=By.CLASS_NAME, value='main')

        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        time.sleep(1)  # let the city suggestion dropdown appear
        ele_toCity.send_keys(Keys.ENTER)
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()

        page_num = 0
        while True:
            try:
                # The result page title contains the city name once loaded.
                WebDriverWait(driver, 10).until(EC.title_contains(to_city))
            except Exception as e:
                print(e)
                break
            time.sleep(2)
            # Scroll to the bottom to trigger the second data load.
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            driver.execute_script(js)
            htm_const = driver.page_source
            # page_source is already a str, so no from_encoding is needed
            # (passing it alongside unicode input only emits a warning).
            soup = BeautifulSoup(htm_const, 'html.parser')
            infos = soup.find_all(class_='hotel-card-detail-btn')
            # 'with' guarantees the file is closed even if a malformed
            # element raises while writing.
            with open(to_city + fromdate + '.html', 'a', encoding='utf-8') as f:
                f.write(str(page_num) + '--' * 50)
                f.write('\r\n')
                for info in infos:
                    href = url + info['href']
                    title = info['title']
                    f.write(href + " " + title)
                    f.write('\r\n')
            try:
                time.sleep(5)
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(
                        driver.find_element(by=By.CLASS_NAME, value='next')))
                next_page.click()
                page_num += 1
                time.sleep(8)  # give the next page time to render
            except Exception as e:
                # No visible 'next' link: last page reached (or timeout).
                print(e)
                break

    def crawl(self, root_url, to_city):
        """Open a detection-hardened Chrome session and crawl *to_city*
        hotels for a one-night stay starting today.

        root_url -- the search page URL to open first
        to_city  -- destination city name
        """
        # Compute the date once so today/tomorrow stay consistent even if
        # the call straddles midnight.
        today_date = datetime.date.today()
        today = today_date.strftime('%Y-%m-%d')
        tomorrow = (today_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        # Some sites detect Selenium; these options are one way around it.
        option = webdriver.ChromeOptions()
        # option.add_argument('--headless')
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)

        # Inject JS that runs before any page script: hide the
        # navigator.webdriver flag that betrays automation.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """ Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) """
        })
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        self.get_hotel(driver, to_city, today, tomorrow)


if __name__ == '__main__':
    spider = QunaSpider()
    spider.crawl('xxxxx', u'长沙')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。