赞
踩
1. 抓取全国景点 【网站 http://www.bytravel.cn 数据】
- #coding=UTF-8
- from urllib.request import Request, urlopen,quote
- from urllib.error import URLError
- import chardet
- from bs4 import BeautifulSoup as BS
-
- import sys
- import re
- # from readability.readability import Document
- # from html2text import html2text
def __searchUrls(pageCur, pageTotal):
    """Crawl attraction listing pages from bytravel.cn and append each
    attraction's title, link and introduction to a local text file.

    Args:
        pageCur:   first listing page to fetch (1-based).
        pageTotal: last listing page to fetch, inclusive.

    Network failures on a page are reported and that page is skipped.
    (The original recursed once per page and leaked an open file handle
    per page; this version loops and closes the file with ``with``.)
    """
    headers = {
        'User-Agent':
        'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    for page in range(pageCur, pageTotal + 1):
        # Page 1 has no numeric suffix; page N maps to suffix N-1.
        if page == 1:
            url = 'http://www.bytravel.cn/view/index109_list.html'
        else:
            url = 'http://www.bytravel.cn/view/index109_list' + str(page - 1) + '.html'
        try:
            req = Request(url, headers=headers)
            response = urlopen(req)
            # gb18030 is a strict superset of gb2312, so 'ignore' drops
            # fewer characters than the original decode('gb2312','ignore').
            content = response.read().decode('gb18030', 'ignore')
            # Explicit stdlib parser: avoids bs4's "no parser specified"
            # warning and parser-dependent behavior across machines.
            soup = BS(content, 'html.parser')
            # NOTE(review): this log message says 上海 (Shanghai) while the
            # output file is 北京景点.txt (Beijing); runtime strings kept
            # verbatim — confirm which city listing index109 actually is.
            print("★ 上海旅游第【" + str(page) + "】页" + url)
            # 'with' guarantees the file is closed even if a row raises.
            with open('北京景点.txt', "a+", encoding='utf-8') as f:
                for result_table in soup.findAll("table", {"id": "tjtable"}):
                    title_div = result_table.find("div", {"id": "tctitle"})
                    link = 'http://www.bytravel.cn' + title_div.a['href']
                    title = title_div.text
                    # Follow the detail link for the full introduction text.
                    text = getContextByurl(link)
                    print('[' + title + ']' + ':' + link)
                    print('简介:' + text + '\n' + '---------------------------------------------' + '\n')
                    f.write('[' + title + ']' + ':' + link)
                    f.write('简介:' + text + '\n' + '---------------------------------------------' + '\n')
        except URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)
-
def getContextByurl(url):
    """Fetch a single attraction detail page and return the text of its
    introduction block (``<div class="f14">``).

    Args:
        url: absolute URL of the detail page.

    Returns:
        The introduction text, or '' when the request fails or the page
        has no such div (the original raised AttributeError in that case
        and could implicitly return None from the except block).
    """
    headers = {
        'User-Agent':
        'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    try:
        req = Request(url, headers=headers)
        response = urlopen(req)
        # gb18030 is a strict superset of gb2312 — fewer dropped characters.
        html = response.read().decode('gb18030', 'ignore')
        soup = BS(html, 'html.parser')  # explicit parser for reproducibility
        div_text = soup.find("div", {"class": "f14"})
        if div_text is None:
            # Page layout without the intro div: degrade gracefully.
            return ''
        return div_text.text
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        # Always hand the caller a string, never None.
        return ''
-
if __name__ == '__main__':
    # Crawl listing pages 1 through 20. (The original comment claimed
    # "pages 1-10 of xixihaha" — a leftover from a different scraper.)
    __searchUrls(1, 20)
    # Single-page smoke test, kept for manual debugging:
    # getContextByurl('http://www.bytravel.cn/Landscape/70/maominglu.html')

2. 抓取全国景点【网站 https://www.meet99.com/lvyou-shanghai.html】
- #coding=UTF-8
- from urllib.request import Request, urlopen,quote
- from urllib.error import URLError
- import chardet
- from bs4 import BeautifulSoup as BS
-
- import sys
- import re
- # from readability.readability import Document
- # from html2text import html2text
def __searchUrls(pageCur, pageTotal):
    """Crawl attraction listing pages from meet99.com and print each
    attraction's title, link, and its "want to go"/"been there" counts.

    Args:
        pageCur:   first listing page to fetch (1-based).
        pageTotal: last listing page to fetch, inclusive.

    The original recursed once per page; this version loops.
    """
    headers = {
        'User-Agent':
        'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    for page in range(pageCur, pageTotal + 1):
        if page == 1:
            url = 'https://www.meet99.com/lvyou-shanghai.html'
        else:
            # NOTE(review): leftover from the bytravel.cn scraper — this is
            # NOT a meet99.com pagination URL. Harmless while the script is
            # run with pageTotal=1, but must be replaced with meet99's real
            # pagination scheme before crawling further pages.
            url = 'http://www.bytravel.cn/view/index109_list' + str(page - 1) + '.html'
        try:
            req = Request(url, headers=headers)
            response = urlopen(req)
            content = response.read().decode('utf-8', 'ignore')
            # Explicit stdlib parser: stable behavior across environments.
            soup = BS(content, 'html.parser')
            print("★ 上海旅游第【" + str(page) + "】页" + url)
            for item in soup.findAll("li", {"class": "box"}):
                title_div = item.find("div", {"class": "img"})
                # "想去" (want to go) / "去过" (been there) counter bar.
                title_like = item.find("div", {"class": "bar"})
                never_cnt = ""
                ever_cnt = ""
                if title_like is not None:
                    never_cnt = title_like.find("a", {"class": "never"}).text
                    ever_cnt = title_like.find("a", {"class": "ever"}).text
                if title_div is None:
                    # Some <li class="box"> entries carry no image/link block.
                    continue
                link = title_div.a['href']
                title = title_div.a.text
                print('[' + title + ']' + ':https://www.meet99.com' + link)
                print(ever_cnt + ' ' + never_cnt)
        except URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)
-
if __name__ == '__main__':
    # Only page 1 is crawled: for pages beyond 1, __searchUrls still
    # builds a leftover bytravel.cn URL instead of a meet99.com one.
    __searchUrls(1, 1)

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。