赞
踩
完整代码如下(可直接复制使用):
- from bs4 import BeautifulSoup
- import urllib.request
- import sqlite3
- import os
- import time
- import threading
-
-
class Database:
    """Thin wrapper around a local SQLite database that stores scraped travel articles."""

    def open(self):
        """Open (or create) travels.db in the working directory; keep connection and cursor."""
        self.con = sqlite3.connect("travels.db")
        self.cursor = self.con.cursor()

    def close(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def initialize(self):
        """(Re)create the items table, discarding any previous crawl's rows."""
        # IF EXISTS replaces the original bare try/except around an unconditional DROP.
        self.cursor.execute("drop table if exists items")
        self.cursor.execute(
            "create table items(ID varchar(8) primary key, tDate varchar(16), "
            "tTitle varchar(1024), tContent text, tExt varchar(8))"
        )

    def insert(self, ID, tDate, tTitle, tContent, tExt):
        """Insert one scraped item.

        Best-effort by design: a failure (e.g. duplicate primary key) is
        printed, not raised, so one bad row does not stop the crawl.
        """
        try:
            self.cursor.execute(
                "insert into items (ID,tDate,tTitle,tContent,tExt) values(?, ?, ?, ?, ?)",
                [ID, tDate, tTitle, tContent, tExt],
            )
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored item (one field per line, blank line between items) and a total."""
        self.cursor.execute("select ID,tDate,tTitle,tContent,tExt from items order by ID")
        rows = self.cursor.fetchall()
        for row in rows:
            for field in row:
                print(field)
            print()
        print("Total", len(rows), "items")
-
-
def downloadImage(ID, src, tExt):
    """Download one image from *src* and save it as download/<ID>.<tExt>.

    Runs on a worker thread; any failure is printed rather than raised so a
    single bad image cannot kill the crawl. Uses the module-level `headers`.
    """
    try:
        req = urllib.request.Request(src, headers=headers)
        # context managers guarantee the response and file are closed (the
        # original leaked the response object and left the file open on error)
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = resp.read()
        imgName = ID + "." + tExt
        # os.path.join is portable; the original hard-coded a Windows backslash
        with open(os.path.join("download", imgName), "wb") as f:
            f.write(data)
        print("Downloaded " + imgName)
    except Exception as err:
        print(err)
-
-
def downloadContent(url):
    """Fetch an article page and return the text of its div#Content paragraphs.

    Each paragraph is followed by a newline; returns "" on any failure
    (printed, not raised). Uses the module-level `headers`.
    """
    content = ""
    try:
        req = urllib.request.Request(url, headers=headers)
        # timeout matches downloadImage so a stalled server cannot hang the crawl
        with urllib.request.urlopen(req, timeout=20) as resp:
            html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        ps = soup.select("div[id='Content'] p")
        # join is linear; the original += loop is quadratic in the worst case
        content = "".join(p.text + "\n" for p in ps)
    except Exception as err:
        print(err)
    return content
-
-
def initializeDownload():
    """Ensure an empty download/ directory exists, deleting any leftover files."""
    # exist_ok avoids the original's racy exists()-then-mkdir check
    os.makedirs("download", exist_ok=True)
    for name in os.listdir("download"):
        # os.path.join is portable; the original hard-coded a Windows backslash
        os.remove(os.path.join("download", name))
-
-
def spider(url):
    # Crawl one China Daily listing page: store every article teaser in the
    # shared Database, spawn one image-download thread per teaser, then
    # recurse into the "Next" pagination link until there is none.
    # Mutates module-level crawl state: page/count counters, DB, threads list.
    # NOTE(review): indentation reconstructed from a formatting-stripped
    # source — the Thread spawn is assumed to sit inside `if img:`; confirm.
    global page, count, DB, threads
    page = page + 1
    print("Page", page, url)
    try:
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        # one div per article teaser on the listing page
        divs = soup.select("div[class='lft_art lf'] div[class='mb10 tw3_01_2']")
        for div in divs:
            tTitle = div.select_one("span h4").text
            tDate = div.select_one("span b").text
            count = count + 1
            # zero-padded sequential ID; also used as the image filename stem
            ID = "%06d" % count
            img = div.select_one("span a img")
            src = ""
            tExt = ""
            if img:
                # resolve a possibly-relative image URL, then take the text
                # after the last dot as the file extension
                src = urllib.request.urljoin(url, img["src"])
                p = src.rfind(".")
                if p >= 0:
                    tExt = src[p + 1:]
                # download the image concurrently; the main loop joins these
                T = threading.Thread(target=downloadImage, args=[ID, src, tExt])
                T.start()
                threads.append(T)
            # follow the teaser's link and fetch the full article body
            link = div.select_one("span h4 a")["href"]
            link = urllib.request.urljoin(url, link)
            tContent = downloadContent(link)
            DB.insert(ID, tDate, tTitle, tContent, tExt)
        nextUrl = ""
        links = soup.select("div[id='div_currpage'] a[class='pagestyle']")
        for link in links:
            if link.text == "Next":
                href = link["href"]
                # some "Next" links are protocol-relative; prepend a scheme
                if href.startswith("//www."):
                    nextUrl = "http:" + href
                else:
                    nextUrl = urllib.request.urljoin(url, href)
                break
        if nextUrl:
            # recursion depth equals the number of listing pages
            spider(nextUrl)
    except Exception as err:
        print(err)
-
-
# Browser-like User-Agent sent with every request so the site serves the pages.
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0)AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 32.0.1664.3Safari / 537.36"}

# Interactive menu: 1 = crawl from scratch, 2 = dump the database, anything else = quit.
while True:
    print("1.Spider")
    print("2.Show")
    print("3.Exit")
    s = input("Please enter(1,2,3):")
    if s == "1":
        # fresh crawl: empty the image directory, reset counters, recreate the table
        initializeDownload()
        threads = []
        page = 0
        count = 0
        DB = Database()
        DB.open()
        DB.initialize()
        spider(url="http://www.chinadaily.com.cn/travel/citytours")
        DB.close()
        # wait for any image downloads still running before reporting totals
        for T in threads:
            T.join()
        print("Total %d pages, %d items" % (page, count))
    elif s == "2":
        # read-only pass over whatever the last crawl stored
        DB = Database()
        DB.open()
        DB.show()
        DB.close()
    else:
        break

运行结果:
爬取到的图片:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。