
Python multithreaded crawler example (scraping Zhaopin job listings)


The Python program, with comments, is as follows.

# Multithreaded crawler for Zhaopin job listings (urllib + BeautifulSoup)

from bs4 import BeautifulSoup
from urllib import request
import re
import time
import threading

# Spoof a browser User-Agent so the site does not reject the request
headers = {'User-Agent' : r'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)' }
# Search-result URL; the page number is appended in spider()
address = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=653&kw=c%2b%2b&sm=0&sg=6729587202404b258ac42bedd469d049&p='
max_thread_count = 11     # maximum number of concurrent threads
threadlist = []           # bookkeeping list of running threads
thread = list(range(15))  # pre-allocate 15 slots for the Thread objects (range() is not a list, so convert it first)
def spider(y, address, headers):
    # y is the page number; fetch one search-result page and write it to its own txt file
    print("thread is starting", y)
    address = address + str(y)
    req = request.Request(address, headers=headers)
    page = request.urlopen(req).read()
    r = page.decode('utf-8')
    nresult = []
    # Parse with BeautifulSoup and pick out company, salary, location and job title by CSS class
    soup = BeautifulSoup(r, "html.parser")
    result = soup.select('.gsmc')    # company name
    result2 = soup.select('.zwyx')   # monthly salary
    result3 = soup.select('.gzdd')   # work location
    result4 = soup.select('.zwmc')   # job title
    # Regex extraction. Do not wrap the patterns in parentheses: a capture group makes
    # re.findall() return only the group's content instead of the whole match.
    result = re.findall(r'[\u4e00-\u9fa5]+', str(result))
    result2 = re.findall(r'[\u4e00-\u9fa5]+|\d+-\d+', str(result2))
    result3 = re.findall(r'[\u4e00-\u9fa5]+', str(result3))
    result4 = re.findall(r'>.*[\u4e00-\u9fa5]+.*<', str(result4))
    # Strip the surrounding < and > from the job-title matches
    for i in result4:
        nresult.append(i.strip('<').strip('>'))
    length = len(result4)
    # Create/append one txt file per page
    f = open('jobtable' + str(y) + '.txt', 'a+', encoding='utf-8')
    for i in range(length):
        f.write(nresult[i] + "\n")  # job title
        time.sleep(0.01)
        f.write(result[i])          # company
        time.sleep(0.01)
        f.write(result2[i])         # monthly salary
        time.sleep(0.01)
        f.write(result3[i] + "\n")  # work location
        time.sleep(0.01)
    # Close the file
    f.close()

    return True


if __name__ == '__main__':
    x = list(range(15))   # crawl 15 result pages

    for y in x:
        # Busy-wait until a thread slot is free, dropping finished threads from the list
        while threading.active_count() >= max_thread_count:
            print("waiting for a free thread slot")
            threadlist[:] = [n for n in threadlist if n.is_alive()]
        print(y)
        thread[y] = threading.Thread(target=spider, args=(y, address, headers))
        thread[y].daemon = True
        thread[y].start()
        time.sleep(0.5)
        threadlist.append(thread[y])
    for y in x:
        # Must join: the workers are daemon threads, so they would be killed as soon as the main thread exits
        thread[y].join()



With the program above, the job listings are scraped into per-page txt files.
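
As a side note, the same page fan-out can also be written with a thread pool from the standard library instead of the hand-rolled busy-wait loop and join() calls. The sketch below is only an illustration under stated assumptions: it reuses the spider(), address and headers defined in the script above, and the old sou.zhaopin.com search URL may no longer return the .gsmc/.zwyx/.gzdd/.zwmc markup the parser expects.

# Minimal thread-pool sketch (assumes spider(), address and headers from the script above)
from concurrent.futures import ThreadPoolExecutor

def crawl_all(pages=15, workers=10):
    # ThreadPoolExecutor caps concurrency at `workers` threads; leaving the
    # "with" block waits for all submitted pages, so no manual join() or
    # daemon flag is needed.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(spider, y, address, headers) for y in range(pages)]
    return [f.result() for f in futures]

Calling crawl_all() would produce the same jobtable0.txt .. jobtable14.txt files; the main difference is that the pool handles the slot bookkeeping, so threadlist and the active_count() check disappear.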
