赞
踩
猎聘网上有关 Python 的职位信息的获取(仅用于练习和代码测试)
利用chrome的自动控制,进行数据的获取
主要获取的字段为,公司名称、职位名称、薪资、应聘要求。
最后将数据存放到数据库中。
- import pymysql
- import sys
-
def save(table):
    """Open the MySQL connection and make sure database ``lp`` and *table* exist.

    The connection and its cursor are published as the module globals
    ``conn`` and ``cur``; ``inser_data`` in this module uses them.

    Args:
        table: Name of the table to create. It is backtick-quoted before being
            placed in the statement, so any string is taken as one identifier.

    Errors from the two ``create`` statements are printed and otherwise
    ignored; errors from connecting or from ``use lp`` propagate.
    """
    global conn
    global cur
    print('------------------------------')
    conn = pymysql.connect(host='127.0.0.1',
                           user='root',
                           passwd='XXX',
                           port=8080,
                           charset='utf8')
    cur = conn.cursor()
    print('获取游标')
    try:
        # IF NOT EXISTS keeps re-runs quiet, so a printed error is a real one.
        cur.execute("create database if not exists lp character set utf8;")
    except Exception as e:
        print(e)
    cur.execute('use lp;')
    # Identifiers cannot be bound as query parameters; quote the name instead
    # (a backtick inside the name is escaped by doubling it).
    quoted_table = '`' + table.replace('`', '``') + '`'
    try:
        cur.execute(
            "create table if not exists " + quoted_table +
            "(id int,company char(100),job char(200),"
            "address char(100),salary char(100),ask varchar(5000))"
            "character set utf8;"
        )
    except Exception as e:
        print(e)
    print('创建表完成')
def inser_data(table, id, company, job, address, salary, ask):
    """Insert one job row into *table* through the module-level cursor ``cur``.

    Args:
        table: Target table name; backtick-quoted before use.
        id, company, job, address, salary, ask: Column values, passed to the
            driver as bound parameters.

    A failing insert is printed and rolled back instead of raised, so the
    caller can carry on with the next row. A successful insert is committed.
    """
    # Identifiers cannot be bound as parameters; quote the name instead
    # (a backtick inside the name is escaped by doubling it).
    quoted_table = '`' + table.replace('`', '``') + '`'
    sql_insert = ('insert into ' + quoted_table +
                  '(id,company,job,address,salary,ask) values (%s,%s,%s,%s,%s,%s);')
    try:
        cur.execute(sql_insert, [id, company, job, address, salary, ask])
    except Exception as e:
        print(e)
        conn.rollback()
    else:
        conn.commit()
-
-
def my_txt(table, ask):
    """Append *ask* to the UTF-8 text file ``<table>.txt``.

    Args:
        table: Path of the file without the ``.txt`` suffix.
        ask: Text to append; written as-is, no separator is added.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the file even when write() raises.
    with open(table + '.txt', 'a+', encoding='utf-8') as f:
        f.write(ask)
-

- '''
- 职位要求的数据全部存储在本地 txt 文档中,用于制作词云
- 公司名称、职位名称和薪资字段全部存放于数据库
- 由于薪资字段的数据显示方式为“XX-XX”的范围,所以全部以字符串的形式进行存放
- '''
- from selenium import webdriver
- from time import sleep
- import random
- import re
- from lp_spider import save_data
- # from lp_spider import py_cloud
start_url = 'https://www.liepin.com/zhaopin/'


def open_url():
    """Start Chrome, load ``start_url`` and maximise the browser window.

    The new WebDriver is stored in the module global ``driver`` before the
    page is requested.
    """
    global driver
    driver = browser = webdriver.Chrome()
    browser.get(start_url)
    browser.maximize_window()
-
def get_page(type):
    """Type *type* into the search box, submit the search and scroll down.

    Operates on the module global ``driver``.

    Args:
        type: Text sent to the search input.
    """
    # Implicit wait of 20 for the element lookups below.
    driver.implicitly_wait(20)

    keyword_input = driver.find_element_by_xpath(
        '//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]/input')
    keyword_input.send_keys(type)

    search_button = driver.find_element_by_xpath(
        '//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]/button')
    search_button.click()

    # Scroll the result page down by 500.
    driver.execute_script('window.scrollBy(0, 500)')
-
-
-
# XPath of one row of the result list; format with the 1-based row number.
_LIST_ITEM_XPATH = '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li[{}]'
_NEXT_PAGE_XPATH = '//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[8]'

# Candidate XPaths per field, tried in order; the detail page is looked up
# under either the "job-view-enterprise" or the "job-hunter" container.
_COMPANY_XPATHS = (
    '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[1]/h3/a[@title]',
    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h3',
    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h1[@title]',
)
_JOB_XPATHS = (
    '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[1]/h1',
    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h1[@title]',
)
_SALARY_XPATHS = (
    '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[1]',
    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[2]/div/div/p[1]',
)
# The original also tried ".../p[2]/span/text()"; that expression selects a
# text node, which find_element cannot return, so it is left out.
_ADDRESS_XPATHS = (
    '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span/a',
    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[2]/div/div/p[2]/span',
    '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span',
)
_ASK_XPATHS = (
    '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[3]/div',
    '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[3]/div',
)

# Section headings removed from the requirement text before it is stored.
_ASK_HEADINGS = ("任职要求:", "岗位职责:", "职位描述:", "岗位要求:", "职责描述:", "任职资格:")

# Script run after each row (except the last of a page), keyed by row badge.
_ROW_SCROLL = {
    '企': 'window.scrollBy(0, 145)',
    '猎': 'window.scrollBy(0,141)',
    '直': 'window.scrollBy(0,145)',
    '无': 'window.scrollBy(0,137)',
    '优': 'window.scrollBy(0,139)',
}


def _first_text(xpaths):
    """Return the text of the first XPath in *xpaths* that can be located."""
    last_error = None
    for xpath in xpaths:
        try:
            return driver.find_element_by_xpath(xpath).text
        except Exception as e:  # best effort: fall through to the next layout
            print(e)
            last_error = e
    raise last_error


def _parse_salary(text):
    """Reduce the salary paragraph text to the single string that is stored."""
    chinese_runs = re.findall('[\u4e00-\u9fa5]+', text)
    if chinese_runs and chinese_runs[0] == '面议':
        return '面议'
    if len(text) < 8:
        return text
    ranges = re.findall('[0-9]*.[0-9]*.[\u4e00-\u9fa5]+', text)
    # Keep the raw text when the pattern finds nothing, rather than losing the row.
    return ranges[0] if ranges else text


def _strip_headings(text):
    """Remove every heading listed in ``_ASK_HEADINGS`` from *text*."""
    for heading in _ASK_HEADINGS:
        text = text.replace(heading, "")
    return text


def _read_detail_page():
    """Return (company, job, address, salary, ask) read from the focused window."""
    driver.implicitly_wait(20)
    company = _first_text(_COMPANY_XPATHS)
    job = _first_text(_JOB_XPATHS)
    salary = _parse_salary(_first_text(_SALARY_XPATHS))
    address = _first_text(_ADDRESS_XPATHS)
    driver.execute_script('window.scrollBy(0,400)')
    ask = _strip_headings(_first_text(_ASK_XPATHS))
    return company, job, address, salary, ask


def _close_detail_windows(list_window):
    """Close every window except *list_window* and give it the focus back."""
    closed_any = False
    for handle in driver.window_handles:
        if handle != list_window:
            driver.switch_to.window(handle)
            driver.close()
            closed_any = True
    if closed_any:
        # Random 1-4 second pause after leaving a detail page.
        sleep(random.choice(range(1, 5)))
    driver.switch_to.window(list_window)


def _scrape_row(table, row_id, item_xpath, page, row):
    """Open one listing, store its fields, and return True if it was stored."""
    try:
        list_window = driver.current_window_handle
        try:
            driver.find_element_by_xpath(item_xpath + '/div/div[1]/h3/a').click()
            print('站点地址:', end=' ')
            print(driver.current_url)
            opened = [h for h in driver.window_handles if h != list_window]
            if opened:
                driver.switch_to.window(opened[-1])
            company, job, address, salary, ask = _read_detail_page()
        finally:
            # Runs on failure too, so a broken row cannot leave the driver
            # stranded on a detail tab.
            _close_detail_windows(list_window)
        print(page, end='.')
        print(row)
        save_data.inser_data(table, str(row_id), company, job, address, salary, ask)
        save_data.my_txt(table, ask)
    except Exception as e:  # best effort: report the row and move on
        print(e)
        return False
    return True


def _go_to_next_page():
    """Click the next-page link, retrying once after scrolling towards it."""
    try:
        driver.find_element_by_xpath(_NEXT_PAGE_XPATH).click()
    except Exception:
        driver.execute_script('window.scrollTo(0,0)')  # back to the top of the page
        driver.execute_script('window.scrollBy(0,{})'.format(145 * 42))
        driver.find_element_by_xpath(_NEXT_PAGE_XPATH).click()


def get_info(table, pages=100, rows_per_page=40):
    """Walk the result list and store every row whose badge text is '企'.

    For each such row the detail page is opened, and company, job title,
    address, salary and requirement text are read. They are passed to
    ``save_data.inser_data``, and the requirement text also to
    ``save_data.my_txt``. Rows that fail are reported with ``print`` and
    skipped. Works on the module global ``driver``.

    Args:
        table: Table name handed to ``save_data.inser_data`` and file stem
            handed to ``save_data.my_txt``.
        pages: Number of result pages to visit (default 100).
        rows_per_page: Number of list rows inspected per page (default 40).

    ``save_data.cur`` and ``save_data.conn`` are closed when the walk ends,
    including when it ends with an exception (for example when the
    next-page link cannot be clicked).
    """
    row_id = 0  # running id of stored rows
    try:
        for page in range(1, pages + 1):
            for row in range(1, rows_per_page + 1):
                item_xpath = _LIST_ITEM_XPATH.format(row)
                try:
                    badge = driver.find_element_by_xpath(item_xpath + '/i/b').text
                except Exception:
                    badge = '无'
                print(badge)
                if badge == '企':
                    if _scrape_row(table, row_id, item_xpath, page, row):
                        row_id += 1
                else:
                    print(page, end='.')
                    print(row, end='完成')
                # Move the list on to the next row; the step depends on the badge.
                if row < rows_per_page and badge in _ROW_SCROLL:
                    driver.execute_script(_ROW_SCROLL[badge])
            _go_to_next_page()
            sleep(random.choice(range(3, 5)))
            driver.execute_script('window.scrollBy(0, 500)')
    finally:
        save_data.cur.close()
        save_data.conn.close()
if __name__ == '__main__':
    # Endless prompt loop: one crawl per category name typed by the user.
    while True:
        print('输入爬取职位类别名称,输入后按回车继续-->', end='')
        ty = input()
        save_data.save(ty)
        open_url()
        try:
            get_page(ty)
            get_info(ty)
        finally:
            # open_url() starts a new browser every round; quit this one so
            # browser processes do not pile up.
            driver.quit()
        print('爬取结束')

- # 词云
-
-
from wordcloud import WordCloud
import cv2
import jieba

# Read the collected text and segment it with jieba; WordCloud receives the
# segments joined by single spaces.
with open('lp.txt', 'r', encoding='utf-8') as f:
    text = f.read()

cut_text = " ".join(jieba.cut(text))

# cv2.imread returns None instead of raising when the image cannot be read.
color_mask = cv2.imread('python1.jpg')
if color_mask is None:
    print('python1.jpg could not be read; the cloud is drawn without a mask')

cloud = WordCloud(
    # Font used for rendering (the leading space that used to be in this
    # path made it invalid).
    font_path="C:\\Windows\\Fonts\\STXINGKA.TTF",
    # Background colour
    background_color='white',
    # Shape of the cloud
    mask=color_mask,
    # Maximum number of words
    max_words=10000,
    # Largest font size
    max_font_size=100,
)

wCloud = cloud.generate(cut_text)
wCloud.to_file('cloud.png')

# Imported after cloud.png has been written, so a missing matplotlib only
# affects the on-screen preview.
import matplotlib.pyplot as plt

plt.imshow(wCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。