Goal: you enter a location, an occupation keyword, and a page range, and the program automatically prints all the job information on those pages.
Implementation: this program is different from all my earlier ones. It is my first time writing in an object-oriented style, and instead of picking apart the page's HTML, it gets its data by parsing the JSON string the API returns.
Here is the code:
import requests, random, re
import urllib.parse  # imported explicitly: "import urllib" alone does not expose urllib.parse in Python 3

user_agent = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
headers = {"User-Agent": random.choice(user_agent)}


class zhilianzhaopin(object):
    '''Spider for the Zhaopin search API.'''
    # Class attribute holding the fixed part of the URL; the query
    # parameters are appended to it to form the complete request URL.
    url = 'https://fe-api.zhaopin.com/c/i/sou?'

    def __init__(self, jl, kw, star_page, end_page):
        '''Store the search parameters; each instance sets them afresh.'''
        self.jl = jl
        self.kw = kw
        self.star_page = star_page
        self.end_page = end_page
        self.list = []

    def handle_request(self, page):
        data = {
            'cityId': self.jl,  # the location saved by __init__
            'kw': self.kw,      # likewise the keyword
            'kt': page
        }
        # self.url references the class attribute; urllib.parse.urlencode(data)
        # turns the dict into a query string, and joining the two gives the
        # real URL we want to request.
        url_now = self.url + urllib.parse.urlencode(data)
        print(url_now)
        request = requests.get(url_now, headers=headers).text  # fetch the response body
        return request  # hand back the page source

    def parsse(self, content):
        '''Pull the individual fields out of the page source passed in as content.'''
        print('*' * 100)
        jobname_list = re.findall(r'"jobName":"(.*?)"', content, re.S)
        cmpnyname_list = re.findall('"company":{"name":"(.*?)","number"', content, re.S)
        cmpnyurl_list = re.findall('"url":"(.*?)"', content, re.S)
        salary_list = re.findall('"salary":"(.*?)"', content, re.S)
        type_list = re.findall('"type":{"name":"(.*?)"}', content, re.S)
        updatedate_list = re.findall('"updateDate":"(.*?)"', content, re.S)
        # The problem shows up here: the program raises no error, yet the
        # print below can stay silent (see the note after the code).
        for a, b, c, d, e, f in zip(jobname_list, cmpnyname_list, cmpnyurl_list,
                                    salary_list, type_list, updatedate_list):
            data = {
                'job title': a,
                'company': b,
                'company url': c,
                'salary': d,
                'company type': e,
                'update date': f
            }
            print(data)
            self.list.append(data)

    def run(self):
        '''Drive the whole spider.'''
        for page in range(self.star_page, self.end_page + 1):  # loop over the requested pages
            content = self.handle_request(page)  # fetch the source for this page
            self.parsse(content)                 # and hand it to parsse for extraction


def main():
    jl = input('Enter the work location: ')
    kw = input('Enter the keyword: ')
    star_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    # Instantiate the class: jl, kw and the page range feed the
    # __init__ defined above, so they must be supplied here.
    spider = zhilianzhaopin(jl, kw, star_page, end_page)
    spider.run()  # call the run method on the instance


if __name__ == '__main__':
    main()
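Since the whole point of hitting fe-api.zhaopin.com is that the response is a JSON string, a cleaner route than six separate regexes is to parse it as JSON. The sketch below is only a guess at the structure: the 'data' -> 'results' envelope and the exact nesting of the fields are assumptions inferred from the regex patterns above, not verified against the live API. It also sidesteps the silent-loop problem discussed next, because missing fields simply come back as None.

import json

def parse_json(content):
    # Hypothetical layout: a top-level 'data' object holding a 'results'
    # list, with field names taken from the regexes above (jobName,
    # company.name, company.url, salary, type.name, updateDate).
    body = json.loads(content)
    for item in body.get('data', {}).get('results', []):
        company = item.get('company', {})
        print({
            'job title': item.get('jobName'),
            'company': company.get('name'),
            'company url': company.get('url'),
            'salary': item.get('salary'),
            'company type': company.get('type', {}).get('name'),  # nesting assumed
            'update date': item.get('updateDate'),
        })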
The program still has a flaw. As the comment inside parsse() notes, no error is raised, yet the print in the loop can show nothing. The likely cause: zip() stops at the shortest of its inputs, so if any one of the six regexes finds no matches (for example, because the site returned an error page), its empty list silences the whole loop.
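A tiny self-contained illustration of that pitfall:

# If any input list is empty, zip() yields nothing and the body never runs.
jobs = ['developer', 'tester']
salaries = []                      # pretend this regex matched nothing
for job, salary in zip(jobs, salaries):
    print(job, salary)             # never reached: zip stops at the shortest list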
Below is my earlier Baidu-image downloader, reworked in the same object-oriented style.
Goal: given a search keyword (tag) and a page range, download the matching images from Baidu Images.
import requests, time, re, random

user_agent = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
headers = {"User-Agent": random.choice(user_agent)}


class baiduimg():
    url = 'http://image.baidu.com/search/flip?tn=baiduimage'

    def __init__(self, wd, star_page, end_page):
        self.wd = wd
        self.star_page = star_page
        self.end_page = end_page

    def get_url(self, page):
        '''Fetch the search-result page.'''
        data = {
            'word': self.wd,
            'pn': page
        }
        # requests appends data as query parameters to the base url
        request = requests.get(self.url, params=data, headers=headers).text
        return request

    def get_url_down(self, contens):
        '''Collect the image links from the page source.'''
        urls_list = re.findall(r'"objURL":"(.*?)"', contens, re.S)
        return urls_list

    def down_list_img(self, url_list):
        '''Download every image link into the target folder.'''
        for i in url_list:
            time.sleep(1)
            title = i.split('/')[-1]  # use the last segment after '/' as the file name
            # Check whether the name already ends in an image extension;
            # re.search returns None when it does not.
            img_end = re.search(r'\.(jpg|gif|png|jpeg|tif|ico)$', title)
            if img_end is None:
                title = title + '.jpg'  # no extension found, so default to .jpg
            print('downloading {}'.format(i))
            # Fetch the image as raw bytes (verify=False skips TLS verification)
            img = requests.get(i, headers=headers, verify=False).content
            try:
                # Open the target file in binary write mode; the with block
                # closes it automatically, so no explicit f.close() is needed.
                with open('/users/qq/Desktop/百度图片下载/{}'.format(title), 'wb') as f:
                    f.write(img)
            except OSError:
                pass  # skip files whose names cannot be written

    def run(self):
        '''Run the three steps above for every page.'''
        for page in range(self.star_page, self.end_page + 1):
            data = self.get_url(page)
            url_list = self.get_url_down(data)
            self.down_list_img(url_list)


def main():
    kw = input('Enter the kind of image to download: ')
    star_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    down = baiduimg(kw, star_page, end_page)  # instantiate the class
    down.run()


if __name__ == '__main__':
    main()  # run the main function
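One fragility worth noting: the script writes to a hard-coded folder, and if that folder does not exist, open() raises FileNotFoundError, which the except clause silently swallows, so nothing gets saved. A small sketch (not part of the original script) that creates the folder up front:

import os

save_dir = '/users/qq/Desktop/百度图片下载'  # the same hard-coded path used above
os.makedirs(save_dir, exist_ok=True)       # create it (and any parents) if missing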