Scraping Hotel Data with Python

Crawler: collecting hotel listings
The goal is to scrape today's hotel listings and save them to a text file.
Functional breakdown:
    Search: enter the destination and the check-in/check-out dates in the search form, then click the search button.
    Load one complete page of data. Qunar (去哪网) renders each results page in two passes: after the first batch loads, the page has to be scrolled to the bottom to trigger the second load (see the scroll sketch after this list).
    With the fully rendered HTML document in hand, use BeautifulSoup to extract the hotel details and save them.
    After parsing a page, click "next page" and continue extracting.
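
As a minimal sketch of that second-pass trick (the helper name scroll_until_stable is mine, not part of the original script), one can keep scrolling until document.body.scrollHeight stops growing, rather than scrolling once and sleeping as the full script below does:

import time

def scroll_until_stable(driver, pause=2.0, max_rounds=10):
    # Scroll to the bottom repeatedly until the page height stops growing,
    # i.e. the lazily loaded second batch of results has finished rendering.
    last_height = driver.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)  # give the page time to fetch and render new items
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:  # nothing new appeared; we are done
            break
        last_height = new_height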
import codecs
import datetime
import time

from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

url = 'xxxx'  # base URL of the target site, used to build absolute hotel links


class QunaSpider(object):

    def get_hotel(self, driver, to_city, fromdate, todate):
        # driver.refresh()
        # locate the destination, check-in and check-out inputs by absolute XPath
        ele_toCity = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[2]/div/div/input')
        ele_fromDate = driver.find_element(by=By.XPATH,
                                           value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[1]/div/input')
        ele_toDate = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[2]/div/input')

        # the search button
        ele_search = driver.find_element(by=By.CLASS_NAME, value='main')

        # type the destination, then press Enter to confirm the suggestion dropdown
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        time.sleep(1)
        ele_toCity.send_keys(Keys.ENTER)

        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)

        ele_toDate.clear()
        ele_toDate.send_keys(todate)

        # wait until the search button is clickable before clicking it
        WebDriverWait(driver, 60).until(EC.element_to_be_clickable(ele_search))

        ele_search.click()

        page_num = 0
        while True:
            try:
                # wait until the results page title contains the city name
                WebDriverWait(driver, 10).until(
                    EC.title_contains(to_city)
                )
            except Exception as e:
                print(e)
                break
            time.sleep(2)
            # scroll to the bottom to trigger the second (lazy) data load
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            driver.execute_script(js)

            htm_const = driver.page_source
            # page_source is already a str, so BeautifulSoup needs no from_encoding
            soup = BeautifulSoup(htm_const, 'html.parser')
            infos = soup.findAll(class_='hotel-card-detail-btn')

            # append this page's results to a per-city, per-date output file
            f = codecs.open(to_city + fromdate + '.html', 'a', 'utf-8')
            f.write(str(page_num) + '--' * 50)
            f.write('\r\n')
            for info in infos:
                href = url + info['href']
                title = info['title']
                f.write(href + "   " + title)
                f.write('\r\n')
            f.close()
            try:
                time.sleep(5)
                # advance to the next results page once the button is clickable
                next_page = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, 'next'))
                )
                next_page.click()
                page_num += 1
                time.sleep(8)
            except Exception as e:
                # no "next" button (last page) or a timeout: stop paging
                print(e)
                break

    def crawl(self, root_url, to_city):
        # check in today, check out tomorrow
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        # Some sites can detect Selenium; the options below are one way to evade that detection
        option = webdriver.ChromeOptions()
        # option.add_argument('--headless')
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)  # opens a browser window; chromedriver must be on PATH
        # inject JS that runs before any page script, via execute_cdp_cmd
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                    Object.defineProperty(navigator, 'webdriver', {
                      get: () => undefined
                    })
                  """
        })

        driver.implicitly_wait(10)
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()

        self.get_hotel(driver, to_city, today, tomorrow)


if __name__ == '__main__':
    spider = QunaSpider()
    spider.crawl('xxxxx', '长沙')
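
Each run appends lines of the form "<link>   <title>" (the two fields are joined by three spaces) plus a numbered dash separator per page. As a minimal sketch for reading those results back (read_hotels is my name, not part of the original script):

import codecs

def read_hotels(path):
    # Parse the output file written above back into (url, title) pairs.
    hotels = []
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or '--' * 50 in line:  # skip blanks and page separators
                continue
            # split only on the first run of three spaces: titles may contain spaces
            parts = line.split('   ', 1)
            if len(parts) == 2:
                hotels.append((parts[0], parts[1]))
    return hotels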
