```python
# Method 1: requests combined with lxml
import requests
from lxml import etree

# 1. Collect every exercise link on the index page and yield the full URL
def get_html(url):
    # Fetch the index page
    html = requests.get(url)
    # Parse the HTML with etree
    seq = etree.HTML(html.text)
    link_list = seq.xpath('//*[@id="content"]/ul/li/a/@href')
    for i in link_list:
        yield "http://www.runoob.com" + i

# 2. Fetch the details from each exercise page
def get_html_link(link):
    for i in link:
        # Fetch the page
        link_html = requests.get(i)
        # Parse it
        link_seq = etree.HTML(link_html.content)
        # Extract the title
        title = link_seq.xpath('//*[@id="content"]/h1/text()')[0]
        # Extract the exercise description
        subject = link_seq.xpath('//*[@id="content"]/p[position()>1 and position()<4]/text()')
        subject_list = '\n'.join(subject)
        yield (title, subject_list)

# 3. Save the data
def save_subject(title_subject):
    with open("G:/1.txt", 'a+', encoding='utf-8') as f:
        for title, subject_list in title_subject:
            f.write(title + '\n')
            f.write(subject_list + '\n')
            f.write("#" * 50 + '\n')

# 4. Wire the steps together
def funcall(url):
    link = get_html(url)
    title_subject = get_html_link(link)
    save_subject(title_subject)

# 5. Entry point
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    funcall(url)

if __name__ == "__main__":
    main()

# for i in get_html('http://www.runoob.com/python/python-100-examples.html'):
#     print(i)
# for i in get_html_link(link):
#     print(i)
```
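Because both `get_html` and `get_html_link` are generators, the whole pipeline is lazy: no exercise page is requested until `save_subject` actually iterates. A minimal smoke-test sketch (the slice size of 3 is an arbitrary choice, not from the original):

```python
# Pull only the first few exercises through the lazy pipeline
# instead of crawling all 100 pages.
from itertools import islice

links = get_html('http://www.runoob.com/python/python-100-examples.html')
# islice stops iteration after 3 items, so only 4 HTTP requests are
# made in total (the index page plus 3 exercise pages)
for title, subject in islice(get_html_link(links), 3):
    print(title)
```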

```python
# Method 2: urllib.request combined with BeautifulSoup
import urllib.request
from bs4 import BeautifulSoup

# 1. Collect every exercise link on the index page
def get_html(url):
    # Fetch the raw HTML
    html = urllib.request.urlopen(url).read()
    # Parse it
    soup = BeautifulSoup(html, 'lxml')
    # Find the first tag with id='content', then its two ul children;
    # walk each ul's li children and every a tag under each li,
    # yielding the full URL
    for i in soup.find(id='content').find_all('ul'):
        for j in i.find_all('li'):
            for k in j.find_all('a'):
                yield 'http://www.runoob.com' + k['href']

# 2. Fetch the details from each exercise page
def get_html_link(link):
    # Walk every collected link
    for i in link:
        # Fetch the exercise page
        link_list = urllib.request.urlopen(i).read()
        # Parse it
        soup = BeautifulSoup(link_list, 'lxml')
        # Locate the tag with id='content'
        content = soup.find(id='content')
        if content:
            # Title from the h1 tag
            title = content.find('h1').string
            # Text of the first three p tags
            conten_list = content.find_all('p', limit=3)
            subject = ''
            for j in conten_list:
                subject += j.get_text()
            yield (title, subject)

# 3. Save the data
def save_subject(title_content):
    with open('G:/2.txt', 'w+', encoding='utf-8') as f:
        for title, content in title_content:
            f.write(title + '\n')
            f.write(content + '\n')
            f.write('#' * 80 + '\n')

# 4. Wire the steps together
def fun_call(url):
    link = get_html(url)
    title_content = get_html_link(link)
    save_subject(title_content)

# 5. Entry point
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    fun_call(url)

if __name__ == '__main__':
    main()
```
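The three nested `find_all` loops in step 1 can be collapsed with BeautifulSoup's CSS-selector API. A sketch of an equivalent link collector, assuming the same `#content ul li a` structure on the index page:

```python
# Equivalent to the nested find_all loops above: one CSS selector
def get_html(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    # select() matches every <a> inside an <li> inside a <ul> under #content
    for a in soup.select('#content ul li a'):
        yield 'http://www.runoob.com' + a['href']
```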

```python
# Method 3: requests with re and BeautifulSoup
import requests, re
from bs4 import BeautifulSoup

# 1. Collect the exercise links by matching href against a regex
def get_html(url):
    html = requests.get(url)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'lxml')
    for i in soup.find_all('a', href=re.compile('^/python/python-exercise')):
        yield 'http://www.runoob.com' + i.get('href')

# 2. Fetch the details from each exercise page
def get_html_link(link_list):
    for link in link_list:
        html_link = requests.get(link)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        title = soup.find('div', class_="article-intro").h1.string
        con = soup.find('div', class_="article-intro").find_all('p')
        # Collect paragraphs until one of the sentinel headings appears
        # (n starts at 1 to skip the first paragraph)
        n = 1
        list1 = []
        while True:
            if (re.match('程序源代码', con[n].text)
                    or re.match(' Python 100例', con[n].text)
                    or re.match('以上实例输出结果为', con[n].text)):
                break
            else:
                list1.append(con[n].text)
                n += 1
        yield (title, list1)

# 3. Save the data
def save_data(content_list):
    with open('G:/3.txt', 'w+', encoding='utf-8') as f:
        for title, content in content_list:
            f.write(title + '\n')
            for line in content:
                f.write(line + '\n')
            f.write('#' * 80 + '\n')

# 4. Wire the steps together
def fun_call(url):
    link_list = get_html(url)
    content_list = get_html_link(link_list)
    save_data(content_list)

# 5. Entry point
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    fun_call(url)

if __name__ == '__main__':
    main()
```
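One caveat with the `while True` scan in step 2: it raises an `IndexError` on any page that lacks all three sentinel headings. A bounds-checked sketch of the same paragraph collection (the helper name `collect_paragraphs` is mine, not from the original):

```python
# Guarded variant of the paragraph scan: stops at a sentinel heading
# or at the end of the paragraph list, whichever comes first.
SENTINELS = ('程序源代码', ' Python 100例', '以上实例输出结果为')

def collect_paragraphs(con):
    list1 = []
    for p in con[1:]:  # skip con[0], as the original loop does
        if any(re.match(s, p.text) for s in SENTINELS):
            break
        list1.append(p.text)
    return list1
```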
