赞
踩
完整代码如下(可直接复制使用):
- from bs4 import BeautifulSoup
- import urllib.request
- import sqlite3
- import os
- import time
- import threading
-
-
class Database:
    """Thin wrapper around a local SQLite database that stores scraped travel articles."""

    def open(self):
        """Open (or create) travels.db in the working directory; keep connection and cursor."""
        self.con = sqlite3.connect("travels.db")
        self.cursor = self.con.cursor()

    def close(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def initialize(self):
        """(Re)create the items table, discarding any previous crawl's rows."""
        # IF EXISTS replaces the original bare try/except around an unconditional DROP.
        self.cursor.execute("drop table if exists items")
        self.cursor.execute(
            "create table items(ID varchar(8) primary key, tDate varchar(16), "
            "tTitle varchar(1024), tContent text, tExt varchar(8))"
        )

    def insert(self, ID, tDate, tTitle, tContent, tExt):
        """Insert one scraped item.

        Best-effort by design: a failure (e.g. duplicate primary key) is
        printed, not raised, so one bad row does not stop the crawl.
        """
        try:
            self.cursor.execute(
                "insert into items (ID,tDate,tTitle,tContent,tExt) values(?, ?, ?, ?, ?)",
                [ID, tDate, tTitle, tContent, tExt],
            )
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored item (one field per line, blank line between items) and a total."""
        self.cursor.execute("select ID,tDate,tTitle,tContent,tExt from items order by ID")
        rows = self.cursor.fetchall()
        for row in rows:
            for field in row:
                print(field)
            print()
        print("Total", len(rows), "items")
-
-
def downloadImage(ID, src, tExt):
    """Download one image from *src* and save it as download/<ID>.<tExt>.

    Runs on a worker thread; any failure is printed rather than raised so a
    single bad image cannot kill the crawl. Uses the module-level `headers`.
    """
    try:
        req = urllib.request.Request(src, headers=headers)
        # context managers guarantee the response and file are closed (the
        # original leaked the response object and left the file open on error)
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = resp.read()
        imgName = ID + "." + tExt
        # os.path.join is portable; the original hard-coded a Windows backslash
        with open(os.path.join("download", imgName), "wb") as f:
            f.write(data)
        print("Downloaded " + imgName)
    except Exception as err:
        print(err)
-
-
def downloadContent(url):
    """Fetch an article page and return the text of its div#Content paragraphs.

    Each paragraph is followed by a newline; returns "" on any failure
    (printed, not raised). Uses the module-level `headers`.
    """
    content = ""
    try:
        req = urllib.request.Request(url, headers=headers)
        # timeout matches downloadImage so a stalled server cannot hang the crawl
        with urllib.request.urlopen(req, timeout=20) as resp:
            html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        ps = soup.select("div[id='Content'] p")
        # join is linear; the original += loop is quadratic in the worst case
        content = "".join(p.text + "\n" for p in ps)
    except Exception as err:
        print(err)
    return content
-
-
def initializeDownload():
    """Ensure an empty download/ directory exists, deleting any leftover files."""
    # exist_ok avoids the original's racy exists()-then-mkdir check
    os.makedirs("download", exist_ok=True)
    for name in os.listdir("download"):
        # os.path.join is portable; the original hard-coded a Windows backslash
        os.remove(os.path.join("download", name))
-
-
def spider(url):
    # Crawl one China Daily listing page: store every article teaser in the
    # shared Database, spawn one image-download thread per teaser, then
    # recurse into the "Next" pagination link until there is none.
    # Mutates module-level crawl state: page/count counters, DB, threads list.
    # NOTE(review): indentation reconstructed from a formatting-stripped
    # source — the Thread spawn is assumed to sit inside `if img:`; confirm.
    global page, count, DB, threads
    page = page + 1
    print("Page", page, url)
    try:
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        # one div per article teaser on the listing page
        divs = soup.select("div[class='lft_art lf'] div[class='mb10 tw3_01_2']")
        for div in divs:
            tTitle = div.select_one("span h4").text
            tDate = div.select_one("span b").text
            count = count + 1
            # zero-padded sequential ID; also used as the image filename stem
            ID = "%06d" % count
            img = div.select_one("span a img")
            src = ""
            tExt = ""
            if img:
                # resolve a possibly-relative image URL, then take the text
                # after the last dot as the file extension
                src = urllib.request.urljoin(url, img["src"])
                p = src.rfind(".")
                if p >= 0:
                    tExt = src[p + 1:]
                # download the image concurrently; the main loop joins these
                T = threading.Thread(target=downloadImage, args=[ID, src, tExt])
                T.start()
                threads.append(T)
            # follow the teaser's link and fetch the full article body
            link = div.select_one("span h4 a")["href"]
            link = urllib.request.urljoin(url, link)
            tContent = downloadContent(link)
            DB.insert(ID, tDate, tTitle, tContent, tExt)
        nextUrl = ""
        links = soup.select("div[id='div_currpage'] a[class='pagestyle']")
        for link in links:
            if link.text == "Next":
                href = link["href"]
                # some "Next" links are protocol-relative; prepend a scheme
                if href.startswith("//www."):
                    nextUrl = "http:" + href
                else:
                    nextUrl = urllib.request.urljoin(url, href)
                break
        if nextUrl:
            # recursion depth equals the number of listing pages
            spider(nextUrl)
    except Exception as err:
        print(err)
-
-
# Browser-like User-Agent sent with every request so the site serves the pages.
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0)AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 32.0.1664.3Safari / 537.36"}

# Interactive menu: 1 = crawl from scratch, 2 = dump the database, anything else = quit.
while True:
    print("1.Spider")
    print("2.Show")
    print("3.Exit")
    s = input("Please enter(1,2,3):")
    if s == "1":
        # fresh crawl: empty the image directory, reset counters, recreate the table
        initializeDownload()
        threads = []
        page = 0
        count = 0
        DB = Database()
        DB.open()
        DB.initialize()
        spider(url="http://www.chinadaily.com.cn/travel/citytours")
        DB.close()
        # wait for any image downloads still running before reporting totals
        for T in threads:
            T.join()
        print("Total %d pages, %d items" % (page, count))
    elif s == "2":
        # read-only pass over whatever the last crawl stored
        DB = Database()
        DB.open()
        DB.show()
        DB.close()
    else:
        break

运行结果:
爬取到的图片:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。