当前位置:   article > 正文

爬取旅游网站——用bs4爬取旅游网站

用bs4爬取旅游网站

完整代码如下(可直接copy):

  1. from bs4 import BeautifulSoup
  2. import urllib.request
  3. import sqlite3
  4. import os
  5. import time
  6. import threading
  7. class Database:
  8. def open(self):
  9. self.con=sqlite3.connect("travels.db")
  10. self.cursor=self.con.cursor()
  11. def close(self):
  12. self.con.commit()
  13. self.con.close()
  14. def initialize(self):
  15. try:
  16. self.cursor.execute("drop table items")
  17. except:
  18. pass
  19. self.cursor.execute("create table items(ID varchar(8) primary key, tDate varchar(16), tTitle varchar(1024), tContent text, tExt varchar(8))")
  20. def insert(self, ID, tDate, tTitle, tContent, tExt):
  21. try:
  22. self.cursor.execute("insert into items (ID,tDate,tTitle,tContent,tExt) values(?, ?, ?, ?, ?)", [ID, tDate, tTitle, tContent, tExt])
  23. except Exception as err:
  24. print(err)
  25. def show(self):
  26. self.cursor.execute("select ID,tDate,tTitle,tContent,tExt from items order by ID")
  27. rows = self.cursor.fetchall()
  28. for row in rows:
  29. print(row[0])
  30. print(row[1])
  31. print(row[2])
  32. print(row[3])
  33. print(row[4])
  34. print()
  35. print("Total", len(rows), "items")
  36. def downloadImage(ID, src, tExt):
  37. try:
  38. req = urllib.request.Request(src, headers=headers)
  39. resp = urllib.request.urlopen(req, timeout=20)
  40. data = resp.read()
  41. imgName = ID + "." + tExt
  42. f = open("download\\" + imgName, "wb")
  43. f.write(data)
  44. f.close()
  45. print("Downloaded " + imgName)
  46. except Exception as err:
  47. print(err)
  48. def downloadContent(url):
  49. content = ""
  50. try:
  51. req = urllib.request.Request(url, headers=headers)
  52. resp = urllib.request.urlopen(req)
  53. html = resp.read().decode()
  54. soup = BeautifulSoup(html, "lxml")
  55. ps = soup.select("div[id='Content'] p")
  56. for p in ps:
  57. content += p.text + "\n"
  58. except Exception as err:
  59. print(err)
  60. return content
  61. def initializeDownload():
  62. if not os.path.exists("download"):
  63. os.mkdir("download")
  64. fs=os.listdir("download")
  65. for f in fs:
  66. os.remove("download\\"+f)
def spider(url):
    """Scrape one listing page: store every article in the database, spawn a
    download thread per article image, then follow the "Next" pagination
    link recursively until there is none.

    Mutates module-level state: `page`/`count` counters, `DB` (Database),
    `threads` (list of started Thread objects); reads `headers`.
    """
    global page, count, DB, threads
    page = page + 1
    print("Page", page, url)
    try:
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        # One div per article inside the left-hand article list.
        divs = soup.select("div[class='lft_art lf'] div[class='mb10 tw3_01_2']")
        for div in divs:
            tTitle = div.select_one("span h4").text
            tDate = div.select_one("span b").text
            count = count + 1
            # Zero-padded sequence number doubles as DB key and image file name.
            ID = "%06d" % count
            img = div.select_one("span a img")
            src = ""
            tExt = ""
            if img:
                src = urllib.request.urljoin(url, img["src"])
                p = src.rfind(".")
                if p >= 0:
                    tExt = src[p + 1:]
                # Download the image concurrently; threads are joined by the caller.
                # NOTE(review): indentation reconstructed — assumed the thread is
                # only started when an <img> exists; confirm against the original.
                T = threading.Thread(target=downloadImage, args=[ID, src, tExt])
                T.start()
                threads.append(T)
            link = div.select_one("span h4 a")["href"]
            link = urllib.request.urljoin(url, link)
            tContent = downloadContent(link)
            DB.insert(ID, tDate, tTitle, tContent, tExt)
        # Locate the "Next" pagination link, if any, and recurse into it.
        nextUrl = ""
        links = soup.select("div[id='div_currpage'] a[class='pagestyle']")
        for link in links:
            if link.text == "Next":
                href = link["href"]
                if href.startswith("//www."):
                    # Protocol-relative link: prepend the scheme explicitly.
                    nextUrl = "http:" + href
                else:
                    nextUrl = urllib.request.urljoin(url, href)
                break
        if nextUrl:
            spider(nextUrl)
    except Exception as err:  # NOTE(review): broad catch hides scraping errors
        print(err)
# Browser-like User-Agent sent with every request (read by the functions above).
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0)AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 32.0.1664.3Safari / 537.36"}

# Interactive menu: 1 = crawl the site, 2 = dump the database, anything else = quit.
while True:
    print("1.Spider")
    print("2.Show")
    print("3.Exit")
    s = input("Please enter(1,2,3):")
    if s == "1":
        # Fresh crawl: empty the image folder, reset counters, rebuild the table.
        initializeDownload()
        threads = []
        page = 0
        count = 0
        DB = Database()
        DB.open()
        DB.initialize()
        spider(url="http://www.chinadaily.com.cn/travel/citytours")
        DB.close()
        # Wait for every image-download thread before reporting totals.
        for T in threads:
            T.join()
        print("Total %d pages, %d items" % (page, count))
    elif s == "2":
        # Show whatever a previous crawl stored in travels.db.
        DB = Database()
        DB.open()
        DB.show()
        DB.close()
    else:
        break

运行结果:

 

爬取到的图片:

 

 

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/寸_铁/article/detail/1015279
推荐阅读
相关标签
  

闽ICP备14008679号