赞
踩
提示:点开主页可以下载全部代码跟数据进行参考!
本文仅用于技术分享与交流;如将本文内容用于其他用途,后果由使用者自行承担。
- from selenium import webdriver
- # from selenium.webdriver.chrome.options import Options
- import undetected_chromedriver as uc
-
def share_browser():
    """Create a maximized Chrome WebDriver with automation-related switches turned off.

    The options object comes from ``undetected_chromedriver`` (``uc``) but the
    driver itself is a plain ``selenium.webdriver.Chrome``.

    Returns:
        The started ``webdriver.Chrome`` instance, window already maximized.
    """
    chrome_options = uc.ChromeOptions()

    # Command-line switches, kept in their original order.
    # The former 'service_args = [...]' entry was dropped: it is not a Chrome
    # switch, and certificate errors are already ignored via
    # --ignore-certificate-errors below.
    switches = (
        '--disable-gpu',
        '--no-sandbox',
        'disable-infobars',
        '--single-process',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        'log-level=3',
        '--disable-extensions',
        '--disable-popup-blocking',
        '--profile-directory=Default',
        '--ignore-certificate-errors',
        '--disable-plugins-discovery',
        '--incognito',
        '--no-first-run',
        '--no-service-autorun',
        '--no-default-browser-check',
        '--password-store=basic',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
    )
    for switch in switches:
        chrome_options.add_argument(switch)

    # Set once; the original assigned 'excludeSwitches' twice with the same value.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Location of the Google Chrome executable (Windows default install path).
    path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
    chrome_options.binary_location = path

    # `options=` replaces the deprecated `chrome_options=` keyword, which
    # newer Selenium 4 releases no longer accept.
    browser = webdriver.Chrome(options=chrome_options)
    browser.maximize_window()

    return browser

获取列表页中各房源详情链接的代码如下(示例):
def get_href(page):
    """Fetch one listing page and return the detail-page hrefs found on it.

    Args:
        page: Page number interpolated into the listing URL.

    Returns:
        A list of ``href`` attribute values matched by the listing XPath;
        empty when nothing matches or the response could not be parsed.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the request exceeds the timeout.
    """
    print(f'获取第:{page}页数据...')
    url = f'https://bj.zu.ke.com/zufang/pg{page}rs%E5%85%AC%E5%AF%93/#contentList'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.get(url, timeout=10)
    tree = etree.HTML(res.text)
    # NOTE(review): the parser may hand back None for an empty body in some
    # lxml versions; guard so we return "no links" instead of AttributeError.
    if tree is None:
        return []
    return tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[1]/a/@href')
抓取单个房源详情页信息的代码如下(示例):
def _switch_around_tab(tab_index, pause_seconds):
    """Click the given tab of the "around" widget, wait, and return the re-parsed page."""
    button = driver.find_element(By.XPATH, f'//*[@id="around"]/div/div[2]/ul[1]/li[{tab_index}]')
    # The click is dispatched through the page's own `$` function rather than
    # a native WebDriver click.
    driver.execute_script("$(arguments[0]).click()", button)
    time.sleep(pause_seconds)
    return etree.HTML(driver.page_source)


def get_content(href):
    """Scrape one listing detail page and return its fields as a flat list.

    URLs already recorded in ``used_url.txt`` are skipped. On success the URL
    is appended to that file so later runs skip it too.

    Relies on module-level ``driver``, ``etree``, ``By`` and ``time`` being
    defined elsewhere in the file.

    Args:
        href: Link path of the listing; appended to ``https://bj.zu.ke.com/``.

    Returns:
        A list of 30 items in this order: title, price, house type, area,
        maintenance, floor, parking, electricity, heating, orientation,
        move-in time, lift, water, gas, tenancy, ten facility flags
        ('有' / '无'), metro, bus route, school, hospital, url.
        An empty list when the URL was already scraped or scraping failed.
    """
    url = 'https://bj.zu.ke.com/' + href

    # A missing history file just means nothing has been scraped yet.
    try:
        with open('used_url.txt', 'r', encoding='utf-8') as fp:
            used_urls = {line.strip('\n') for line in fp}
    except FileNotFoundError:
        used_urls = set()

    data_list = []
    if url in used_urls:
        return data_list

    try:
        driver.get(url)
        tree = etree.HTML(driver.page_source)

        # Title
        title = tree.xpath('/html/body/div[3]/div[1]/div[3]/p/text()')[0].strip('\n').strip(' ')
        # Price (yuan / month); may be absent
        price = tree.xpath('//*[@id="aside"]/div[1]/span/text()')
        price = price[0] if price else ''
        # House type
        house_class = tree.xpath('//*[@id="aside"]/ul/li[2]/text()')[0]

        # (xpath, label prefix replaced by a space), in output order:
        # area, maintenance, floor, parking, electricity, heating,
        # orientation, move-in time, lift, water, gas, tenancy.
        info_fields = (
            ('//*[@id="info"]/ul[1]/li[2]/text()', '面积:'),
            ('//*[@id="info"]/ul[1]/li[5]/text()', '维护:'),
            ('//*[@id="info"]/ul[1]/li[8]/text()', '楼层:'),
            ('//*[@id="info"]/ul[1]/li[11]/text()', '车位:'),
            ('//*[@id="info"]/ul[1]/li[14]/text()', '用电:'),
            ('//*[@id="info"]/ul[1]/li[17]/text()', '采暖:'),
            ('//*[@id="info"]/ul[1]/li[3]/text()', '朝向:'),
            ('//*[@id="info"]/ul[1]/li[6]/text()', '入住:'),
            ('//*[@id="info"]/ul[1]/li[9]/text()', '电梯:'),
            ('//*[@id="info"]/ul[1]/li[12]/text()', '用水:'),
            ('//*[@id="info"]/ul[1]/li[15]/text()', '燃气:'),
            ('//*[@id="info"]/ul[2]/li[2]/text()', '租期:'),
        )
        info_values = [
            tree.xpath(xpath)[0].replace(label, ' ')
            for xpath, label in info_fields
        ]

        # Facility flags from list items 2..11: washing machine, air
        # conditioner, wardrobe, TV, fridge, water heater, bed, heating,
        # broadband, natural gas. A class containing 'no' means absent.
        facility_xpath = '/html/body/div[3]/div[1]/div[3]/div[3]/div[2]/ul/li[{}]/@class'
        statuses = [
            '有' if 'no' not in tree.xpath(facility_xpath.format(index))[0] else '无'
            for index in range(2, 12)
        ]

        # Scroll the "around" section into view, then give the page a moment
        # before re-reading its source.
        ditu = driver.find_element(By.XPATH, '//*[@id="around"]/h3')
        driver.execute_script("arguments[0].scrollIntoView();", ditu)
        time.sleep(1)

        # Metro (default tab)
        tree = etree.HTML(driver.page_source)
        metro = tree.xpath('//*[@id="around"]/div/div[2]/ul[2]/li/p[1]/span/text()')
        metro = metro[0] if metro else ''

        # Bus
        tree = _switch_around_tab(2, 1)
        bus_route = tree.xpath('//*[@id="around"]/div/div[2]/ul[2]/li[1]/p[2]/text()')
        bus_route = bus_route[0] if bus_route else ''

        # School: kept as the list's string form with the brackets stripped
        tree = _switch_around_tab(3, 2)
        school = tree.xpath('//*[@id="around"]/div/div[2]/ul[2]/li[1]/p[1]/span/text()')
        school = str(school).strip('[').strip(']')

        # Hospital: same representation as school
        tree = _switch_around_tab(4, 2)
        hospital = tree.xpath('//*[@id="around"]/div/div[2]/ul[2]/li[1]/p[1]/span/text()')
        hospital = str(hospital).strip('[').strip(']')

        data_list = [
            title, price, house_class,
            *info_values,
            *statuses,
            metro, bus_route, school, hospital, url,
        ]
        print(data_list)

        # Record the URL only after every field was extracted successfully.
        with open('used_url.txt', 'a+', encoding='utf-8') as fp:
            fp.write(url + '\n')
    except Exception as exc:
        # Best-effort scraping: report and skip this listing, but let
        # KeyboardInterrupt / SystemExit propagate so the crawl can be stopped.
        print(f'出错url:{url}')
        print(f'{type(exc).__name__}: {exc}')
    return data_list

点开主页可以下载全部代码跟数据进行参考!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。