赞
踩
目标是爬取今天的酒店信息,并将这些信息存成文本文件。
功能分解:
搜索功能,在搜索框输入地点和入住时间,点击搜索按钮
获取一页完整的数据,由于去哪网一个页面数据分为两次加载,第一次加载数据,这时候需要将页面拉到底部,完成第二次数据加载。
获取一页完整且渲染过的HTML文档后,使用BeautifulSoup将其中的酒店信息提取出来进行存储
解析完成,点击下一页,继续抽取数据
import datetime
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Base URL prepended to the relative hotel links found on result pages.
url = 'xxxx'


class QunaSpider(object):
    """Crawl qunar.com hotel listings for one city and date range.

    Each result page's hotel links and titles are appended to a text file
    named '<city><checkin-date>.html'.
    """

    def get_hotel(self, driver, to_city, fromdate, todate):
        """Fill in the search form, then walk every result page.

        The site loads each result page in two stages, so after every
        navigation the window is scrolled to the bottom to trigger the
        second (lazy) load before the HTML is parsed.

        driver   -- an open selenium WebDriver already on the search page
        to_city  -- destination city name
        fromdate -- check-in date, 'YYYY-MM-DD'
        todate   -- check-out date, 'YYYY-MM-DD'
        """
        # NOTE(review): absolute XPaths are brittle -- they break whenever
        # the page layout changes; verify against the live page before use.
        ele_toCity = driver.find_element(
            by=By.XPATH,
            value='/html/body/div[3]/div/section/section[1]/div[1]/div[2]/div/div/input')
        ele_fromDate = driver.find_element(
            by=By.XPATH,
            value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[1]/div/input')
        ele_toDate = driver.find_element(
            by=By.XPATH,
            value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[2]/div/input')
        ele_search = driver.find_element(by=By.CLASS_NAME, value='main')

        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        time.sleep(1)  # let the city suggestion dropdown appear
        ele_toCity.send_keys(Keys.ENTER)
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()

        page_num = 0
        while True:
            try:
                # The result page title contains the city name once loaded.
                WebDriverWait(driver, 10).until(EC.title_contains(to_city))
            except Exception as e:
                print(e)
                break
            time.sleep(2)
            # Scroll to the bottom to trigger the second data load.
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            driver.execute_script(js)
            htm_const = driver.page_source
            # page_source is already a str, so no from_encoding is needed
            # (passing it alongside unicode input only emits a warning).
            soup = BeautifulSoup(htm_const, 'html.parser')
            infos = soup.find_all(class_='hotel-card-detail-btn')
            # 'with' guarantees the file is closed even if a malformed
            # element raises while writing.
            with open(to_city + fromdate + '.html', 'a', encoding='utf-8') as f:
                f.write(str(page_num) + '--' * 50)
                f.write('\r\n')
                for info in infos:
                    href = url + info['href']
                    title = info['title']
                    f.write(href + " " + title)
                    f.write('\r\n')
            try:
                time.sleep(5)
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(
                        driver.find_element(by=By.CLASS_NAME, value='next')))
                next_page.click()
                page_num += 1
                time.sleep(8)  # give the next page time to render
            except Exception as e:
                # No visible 'next' link: last page reached (or timeout).
                print(e)
                break

    def crawl(self, root_url, to_city):
        """Open a detection-hardened Chrome session and crawl *to_city*
        hotels for a one-night stay starting today.

        root_url -- the search page URL to open first
        to_city  -- destination city name
        """
        # Compute the date once so today/tomorrow stay consistent even if
        # the call straddles midnight.
        today_date = datetime.date.today()
        today = today_date.strftime('%Y-%m-%d')
        tomorrow = (today_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        # Some sites detect Selenium; these options are one way around it.
        option = webdriver.ChromeOptions()
        # option.add_argument('--headless')
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)

        # Inject JS that runs before any page script: hide the
        # navigator.webdriver flag that betrays automation.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """ Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) """
        })
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        self.get_hotel(driver, to_city, today, tomorrow)


if __name__ == '__main__':
    spider = QunaSpider()
    spider.crawl('xxxxx', u'长沙')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。