当前位置:   article > 正文

Python 抓取旅游信息_python旅游信息采集

python旅游信息采集

1. 抓取全国景点    【网站  http://www.bytravel.cn 数据】

  1. #coding=UTF-8
  2. from urllib.request import Request, urlopen,quote
  3. from urllib.error import URLError
  4. import chardet
  5. from bs4 import BeautifulSoup as BS
  6. import sys
  7. import re
  8. # from readability.readability import Document
  9. # from html2text import html2text
  10. def __searchUrls(pageCur,pageTotal):
  11. if pageCur == 1:
  12. url = 'http://www.bytravel.cn/view/index109_list.html'
  13. else:
  14. url = 'http://www.bytravel.cn/view/index109_list' + str(pageCur-1)+'.html'
  15. if pageCur > pageTotal:#获取前pageTotal页
  16. return
  17. else:
  18. try:
  19. # print(pageCur)
  20. # print(url)
  21. headers = {
  22. 'User-Agent':
  23. 'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
  24. }
  25. req = Request(url, headers=headers)
  26. response = urlopen(req)
  27. content = response.read().decode('gb2312','ignore')
  28. soup = BS(content)
  29. # print(soup)
  30. f=open('北京景点.txt', "a+",encoding='utf-8') #写入文件
  31. print("★ 上海旅游第【"+str(pageCur)+"】页"+url)
  32. for result_table in soup.findAll("table", {"id": "tjtable"}):
  33. # a_content =result_table.find("div",{"class": "user-section"})
  34. # a_href = a_content.find("a",{"class": "more flc80"})
  35. # text = getContextByurl('http://www.xxhh.com'+a_href.get("href"))
  36. title_div = result_table.find("div",{"id": "tctitle"})
  37. link = 'http://www.bytravel.cn'+title_div.a['href']
  38. title = title_div.text
  39. # text =result_table.find("div",{"id": "tcjs"}).text
  40. text = getContextByurl(link)
  41. print('['+title+']'+':'+link)
  42. print('简介:'+text+'\n'+'---------------------------------------------'+'\n')
  43. f.write('['+title+']'+':'+link)
  44. f.write('简介:'+text+'\n'+'---------------------------------------------'+'\n')
  45. except URLError as e:
  46. if hasattr(e, 'reason'):
  47. print('We failed to reach a server.')
  48. print('Reason: ', e.reason)
  49. elif hasattr(e, 'code'):
  50. print('The server couldn\'t fulfill the request.')
  51. print('Error code: ', e.code)
  52. pageCur = pageCur+ 1
  53. __searchUrls(pageCur,pageTotal)
  54. def getContextByurl(url):
  55. try:
  56. headers = {
  57. 'User-Agent':
  58. 'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
  59. }
  60. # print(url)
  61. req = Request(url, headers=headers)
  62. response = urlopen(req)
  63. html = response.read().decode('gb2312','ignore')
  64. soup = BS(html)
  65. # article =Document(html).summary()
  66. # text = html2text(article)
  67. div_text = soup.find("div", {"class": "f14"})
  68. return div_text.text
  69. except URLError as e:
  70. if hasattr(e, 'reason'):
  71. print('We failed to reach a server.')
  72. print('Reason: ', e.reason)
  73. return ''
  74. elif hasattr(e, 'code'):
  75. print('The server couldn\'t fulfill the request.')
  76. print('Error code: ', e.code)
  77. return ''
if __name__ == '__main__':
    # Crawl listing pages 1 through 20 of the attraction index.
    # (Original comment claimed "pages one to ten" of a different site —
    # it was copied from another scraper.)
    __searchUrls(1,20)
    # getContextByurl('http://www.bytravel.cn/Landscape/70/maominglu.html')

2. 抓取上海景点【网站 https://www.meet99.com/lvyou-shanghai.html】 

  1. #coding=UTF-8
  2. from urllib.request import Request, urlopen,quote
  3. from urllib.error import URLError
  4. import chardet
  5. from bs4 import BeautifulSoup as BS
  6. import sys
  7. import re
  8. # from readability.readability import Document
  9. # from html2text import html2text
  10. def __searchUrls(pageCur,pageTotal):
  11. if pageCur == 1:
  12. url = 'https://www.meet99.com/lvyou-shanghai.html'
  13. else:
  14. url = 'http://www.bytravel.cn/view/index109_list' + str(pageCur-1)+'.html'
  15. if pageCur > pageTotal:#获取前pageTotal页
  16. return
  17. else:
  18. try:
  19. # print(pageCur)
  20. # print(url)
  21. headers = {
  22. 'User-Agent':
  23. 'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
  24. }
  25. req = Request(url, headers=headers)
  26. response = urlopen(req)
  27. content = response.read().decode('utf-8','ignore')
  28. soup = BS(content)
  29. # print(soup)
  30. # f=open('北京景点.txt', "a+",encoding='utf-8') #写入文件
  31. print("★ 上海旅游第【"+str(pageCur)+"】页"+url)
  32. for result_table in soup.findAll("li", {"class": "box"}):
  33. # a_content =result_table.find("div",{"class": "user-section"})
  34. # a_href = a_content.find("a",{"class": "more flc80"})
  35. # text = getContextByurl('http://www.xxhh.com'+a_href.get("href"))
  36. title_div = result_table.find("div",{"class": "img"})
  37. title_like = result_table.find("div",{"class": "bar"})# 想去与去过
  38. never_cnt =""
  39. ever_cnt = ""
  40. if title_like is not None:
  41. never_cnt = title_like.find("a",{"class": "never"}).text
  42. ever_cnt = title_like.find("a",{"class": "ever"}).text
  43. if title_div is None:
  44. continue
  45. link =title_div.a['href']
  46. title = title_div.a.text
  47. # text =result_table.find("div",{"id": "tcjs"}).text
  48. # text = getContextByurl(link)
  49. print('['+title+']'+':https://www.meet99.com'+link)
  50. print( ever_cnt +' '+never_cnt)
  51. # f.write('['+title+']'+':'+link)
  52. # f.write('简介:'+text+'\n'+'---------------------------------------------'+'\n')
  53. except URLError as e:
  54. if hasattr(e, 'reason'):
  55. print('We failed to reach a server.')
  56. print('Reason: ', e.reason)
  57. elif hasattr(e, 'code'):
  58. print('The server couldn\'t fulfill the request.')
  59. print('Error code: ', e.code)
  60. pageCur = pageCur+ 1
  61. __searchUrls(pageCur,pageTotal)
if __name__ == '__main__':
    # Only the first listing page: the pageCur > 1 URL branch is broken
    # (see the note inside __searchUrls), so stay at pageTotal = 1.
    __searchUrls(1,1)

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/笔触狂放9/article/detail/372234
推荐阅读
相关标签
  

闽ICP备14008679号