当前位置:   article > 正文

Python+selenium 爬取贝壳房源数据_爬虫获取贝壳二手房数据

爬虫获取贝壳二手房数据

提示:点开主页可以下载全部代码跟数据进行参考!

文章目录


前言

本文仅用于技术分享与交流;如将文中内容用于其他用途,后果由使用者自行承担。

一、封装selenium

  from selenium import webdriver
  # from selenium.webdriver.chrome.options import Options
  import undetected_chromedriver as uc


  def share_browser(binary_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe"):
      """Create, maximize and return a Chrome WebDriver configured for scraping.

      Args:
          binary_path: Path to the Chrome executable. Defaults to the standard
              Windows install location used by the original snippet. Pass a
              falsy value to let the driver locate Chrome on its own.

      Returns:
          The started ``webdriver.Chrome`` instance with its window maximized.
      """
      chrome_options = uc.ChromeOptions()

      # Switches that were left disabled in the original snippet and can be
      # enabled when needed:
      #   --proxy-server=http://27.150.162.104:4278
      #   --headless
      #   --start-maximized / --start-fullscreen
      #
      # NOTE: the original also passed a literal 'service_args = [...]' string
      # as an argument. That text is not a Chrome switch, so it was dropped;
      # certificate errors are covered by --ignore-certificate-errors below.
      arguments = (
          '--disable-gpu',
          '--no-sandbox',
          'disable-infobars',
          '--single-process',
          '--disable-dev-shm-usage',
          '--disable-blink-features=AutomationControlled',
          'log-level=3',
          '--disable-extensions',
          '--disable-popup-blocking',
          '--profile-directory=Default',
          '--ignore-certificate-errors',
          '--disable-plugins-discovery',
          '--incognito',
          '--no-first-run',
          '--no-service-autorun',
          '--no-default-browser-check',
          '--password-store=basic',
          'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
      )
      for argument in arguments:
          chrome_options.add_argument(argument)

      # Set once (the original set 'excludeSwitches' twice with the same value).
      chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
      chrome_options.add_experimental_option('useAutomationExtension', False)

      if binary_path:
          chrome_options.binary_location = binary_path

      # `options=` is the supported keyword; the older `chrome_options=` keyword
      # was deprecated and is rejected by recent Selenium 4 releases.
      browser = webdriver.Chrome(options=chrome_options)
      browser.maximize_window()
      return browser

二、抓取流程

1.翻页功能

代码如下(示例):

  def get_href(page, timeout=10):
      """Fetch one result page of the listing search and return its detail links.

      The URL searches bj.zu.ke.com/zufang for the URL-encoded keyword
      "%E5%85%AC%E5%AF%93" (the UTF-8 encoding of "公寓").

      Args:
          page: Page number inserted into the ``pg{page}`` URL segment.
          timeout: Seconds to wait for the server before giving up. Without a
              timeout ``requests.get`` can block forever on a stalled
              connection, which would hang the whole crawl.

      Returns:
          A list of ``href`` attribute values taken from the listing titles
          (empty if the XPath matches nothing).

      Raises:
          requests.RequestException: If the request fails or times out.
      """
      print(f'获取第:{page}页数据...')
      url = f'https://bj.zu.ke.com/zufang/pg{page}rs%E5%85%AC%E5%AF%93/#contentList'
      res = requests.get(url, timeout=timeout)
      tree = etree.HTML(res.text)
      return tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[1]/a/@href')

2.通过xpath获取数据

代码如下(示例):

  # (li index under //*[@id="info"]/ul[1], label prefix to blank out), listed in
  # the order the values appear in the returned row.
  _INFO_FIELDS = (
      (2, '面积:'),   # area
      (5, '维护:'),   # last maintained
      (8, '楼层:'),   # floor
      (11, '车位:'),  # parking
      (14, '用电:'),  # electricity
      (17, '采暖:'),  # heating
      (3, '朝向:'),   # orientation
      (6, '入住:'),   # move-in time
      (9, '电梯:'),   # elevator
      (12, '用水:'),  # water
      (15, '燃气:'),  # gas
  )

  # li[2]..li[11] of the facilities list: washing machine, air conditioner,
  # wardrobe, TV, fridge, water heater, bed, heating, broadband, natural gas.
  _FACILITY_INDEXES = range(2, 12)

  _AROUND_LIST = '//*[@id="around"]/div/div[2]'


  def _load_used_urls(path='used_url.txt'):
      """Return the set of URLs already recorded in *path* (empty if it is missing)."""
      try:
          with open(path, 'r', encoding='utf-8') as fp:
              return {line.strip('\n') for line in fp}
      except FileNotFoundError:
          # First run: nothing has been scraped yet.
          return set()


  def _first_or_blank(values):
      """Return the first XPath result, or '' when there is none."""
      return values[0] if values else ''


  def _click_around_tab(index, pause):
      """Click tab *index* of the "around" widget, wait, and return the re-parsed page."""
      button = driver.find_element(By.XPATH, f'{_AROUND_LIST}/ul[1]/li[{index}]')
      # The page's own jQuery is used to trigger the click, as in the original.
      driver.execute_script("$(arguments[0]).click()", button)
      time.sleep(pause)
      return etree.HTML(driver.page_source)


  def get_content(href):
      """Scrape one listing detail page and return its fields as a flat list.

      Relies on module-level ``driver`` (a Selenium WebDriver), ``By``, ``time``
      and ``etree`` being available. URLs already present in ``used_url.txt``
      are skipped; a successfully scraped URL is appended to that file.

      Args:
          href: Link of the detail page, appended to ``https://bj.zu.ke.com/``.

      Returns:
          ``[title, price, house_class, area, maintain, floor, parking_lot,
          use_cable, heating, point, move_into_time, lift, use_water, gas,
          tenancy, <10 facility flags '有'/'无'>, metro, bus_route, school,
          hospital, url]``, or ``[]`` when the URL was already scraped or
          scraping failed.
      """
      url = 'https://bj.zu.ke.com/' + href
      data_list = []
      if url in _load_used_urls():
          return data_list

      try:
          driver.get(url)
          tree = etree.HTML(driver.page_source)

          title = tree.xpath('/html/body/div[3]/div[1]/div[3]/p/text()')[0].strip('\n').strip(' ')
          # Price is in yuan per month; some pages have none.
          price = _first_or_blank(tree.xpath('//*[@id="aside"]/div[1]/span/text()'))
          house_class = tree.xpath('//*[@id="aside"]/ul/li[2]/text()')[0]

          info = [
              tree.xpath(f'//*[@id="info"]/ul[1]/li[{index}]/text()')[0].replace(label, ' ')
              for index, label in _INFO_FIELDS
          ]
          # Lease term lives in the second info list.
          tenancy = tree.xpath('//*[@id="info"]/ul[2]/li[2]/text()')[0].replace('租期:', ' ')

          # A facility is absent when its <li> class contains "no".
          facilities = []
          for index in _FACILITY_INDEXES:
              css_class = tree.xpath(
                  f'/html/body/div[3]/div[1]/div[3]/div[3]/div[2]/ul/li[{index}]/@class')[0]
              facilities.append('有' if 'no' not in css_class else '无')

          # Scroll the "around" section into view, then pause before re-reading
          # the page so the first tab's results can be parsed.
          around_heading = driver.find_element(By.XPATH, '//*[@id="around"]/h3')
          driver.execute_script("arguments[0].scrollIntoView();", around_heading)
          time.sleep(1)

          # Subway (default tab).
          tree = etree.HTML(driver.page_source)
          metro = _first_or_blank(tree.xpath(f'{_AROUND_LIST}/ul[2]/li/p[1]/span/text()'))

          # Bus.
          tree = _click_around_tab(2, 1)
          bus_route = _first_or_blank(tree.xpath(f'{_AROUND_LIST}/ul[2]/li[1]/p[2]/text()'))

          # School: kept as the stringified list without the outer brackets.
          tree = _click_around_tab(3, 2)
          school = tree.xpath(f'{_AROUND_LIST}/ul[2]/li[1]/p[1]/span/text()')
          school = str(school).strip('[').strip(']')

          # Hospital: same format as school.
          tree = _click_around_tab(4, 2)
          hospital = tree.xpath(f'{_AROUND_LIST}/ul[2]/li[1]/p[1]/span/text()')
          hospital = str(hospital).strip('[').strip(']')

          data_list = [
              title, price, house_class, *info, tenancy,
              *facilities,
              metro, bus_route, school, hospital, url,
          ]
          print(data_list)
          with open('used_url.txt', 'a+', encoding='utf-8') as fp:
              fp.write(url + '\n')
      except Exception as exc:
          # Best-effort scraping: report the failing URL and its cause, but let
          # KeyboardInterrupt / SystemExit propagate so the crawl can be stopped.
          print(f'出错url:{url}')
          print(f'{type(exc).__name__}: {exc}')
      return data_list

总结

点开主页可以下载全部代码跟数据进行参考!

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/不正经/article/detail/212254
推荐阅读
相关标签
  

闽ICP备14008679号