Python多线程爬取小说

作者：我家自动化 | 2024-06-17 00:21:28

踩

# pip install requests, pip install lxml；import requests；from lxml import etree……

上一篇 Python爬取文章和小说内容

一、效果

在这里插入图片描述

二、代码(可直接运行)

先安装所需要的库

1、pip install requests
2、pip install lxml
3、pip install bs4（可选：下文代码只用到 requests 和 lxml，并未使用 bs4）

安装完成后则可运行代码：

import requests
from lxml import etree
import os
from queue import Queue
import threading
class Biquge(threading.Thread):
    """Worker thread that downloads novels from xbiquge.la into text files.

    One instance can be used directly (call ``get_novels`` to list the novel
    URLs on an index page) or started as a thread, in which case ``run``
    pulls novel URLs from the shared queue until it is empty.
    """

    BASE_URL = 'http://www.xbiquge.la'
    PROXY_POOL_URL = 'http://localhost:5000/get'
    REQUEST_TIMEOUT = 10  # seconds; without a timeout a dead proxy hangs the worker
    MAX_RETRIES = 5       # extra attempts (each with a fresh proxy) after the first failure

    def __init__(self, url=None, name=None, q_novels=None):
        """
        :param url: index page listing novels (only needed for ``get_novels``)
        :param name: thread name; ``None`` keeps the default ``Thread-N`` name
        :param q_novels: queue of novel URLs consumed by ``run``
        """
        # Pass the name to Thread so that name=None yields the default
        # "Thread-N" instead of a thread literally called "None".
        super().__init__(name=name)
        self.url = url
        self.q_novel = q_novels
        self.proxies = self.get_proxies()

    def get_proxies(self):
        """Fetch one proxy address from the local proxy pool.

        :return: a ``proxies`` dict for ``requests``, or ``None`` when the
                 pool is unreachable or returns an empty answer
        """
        try:
            response = requests.get(self.PROXY_POOL_URL,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        proxy = response.text.strip()
        if not proxy:
            # An empty body would otherwise produce the invalid proxy 'http://'.
            return None
        return {'http': 'http://' + proxy}

    def get_xpath_by_requests(self, url, proxies, max_retries=None):
        """Download ``url`` and parse it into an lxml element tree.

        On failure a new proxy is requested and the download is retried, but
        only a bounded number of times (the original retried recursively
        without limit).

        :param url: page to download
        :param proxies: proxy dict for ``requests`` (may be ``None``)
        :param max_retries: retries after the first attempt; defaults to
                            ``MAX_RETRIES``
        :return: parsed tree, or ``None`` when every attempt failed
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Cookie': '_abcde_qweasd=0; _abcde_qweasd=0; bdshare_firstime=1577178973028; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1577178973,1577186563,1577186739,1577235413; BAIDU_SSP_lcr=https://www.baidu.com/link?url=AvLJGcMiHKBXi90P2T0xOluezhPz2PeeTLAbP75dmma&wd=&eqid=e131d391001338d8000000025e02b3d2; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1577235422',
            'Referer': 'http://www.xbiquge.la/'
        }
        retries = self.MAX_RETRIES if max_retries is None else max_retries
        for attempt in range(retries + 1):
            try:
                response = requests.get(url, headers=headers, proxies=proxies,
                                        timeout=self.REQUEST_TIMEOUT)
                return etree.HTML(response.content.decode('utf-8'))
            except (requests.RequestException, ValueError, etree.LxmlError):
                # ValueError also covers UnicodeDecodeError from decode().
                if attempt < retries:
                    proxies = self.get_proxies()
                    print('更换{}代理ip！'.format(proxies))
        print('giving up on {} after {} attempts'.format(url, retries + 1))
        return None

    def get_text(self, text):
        """Return the first element of an xpath result list, or '' if empty."""
        if text:
            return text[0]
        return ''

    def write_to_txt(self, text, book_name):
        """Append ``text`` to ``./book/<book_name>.txt``, creating the folder.

        :param text: text to append
        :param book_name: file name (without extension) inside ``./book/``
        """
        filename = './book/' + book_name + '.txt'
        # exist_ok avoids the exists()/mkdir() race between worker threads.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'a', encoding='utf-8') as fp:
            fp.write(text)

    @staticmethod
    def _format_chapter(chapter_name, content):
        """Join a chapter title and its body with a real line break."""
        # The original used r'\n', which wrote a literal backslash + 'n'.
        return chapter_name + '\n' + content

    def parse_chapter(self, url):
        """Download one chapter and append it to its book's text file.

        :param url: chapter path relative to ``BASE_URL``
        """
        html = self.get_xpath_by_requests(self.BASE_URL + url, self.proxies)
        if html is None:
            return
        chapter_name = self.get_text(html.xpath('//div[@class="bookname"]/h1/text()'))
        book_name = self.get_text(html.xpath('//div[@class="con_top"]/a[last()]/text()'))
        content = ''.join(html.xpath('//div[@id="content"]/text()'))
        self.write_to_txt(self._format_chapter(chapter_name, content), book_name)

    def parse_novel(self, url):
        """Download every chapter linked from a novel's table of contents.

        :param url: URL of the novel's chapter-list page
        """
        html = self.get_xpath_by_requests(url, self.proxies)
        if html is None:
            return
        for chapter in html.xpath('//div[@id="list"]/dl/dd/a/@href'):
            self.parse_chapter(chapter)

    def get_novels(self):
        """Return the novel URLs listed on ``self.url``.

        :return: list of href values; empty when the page could not be loaded
        """
        html = self.get_xpath_by_requests(self.url, self.proxies)
        if html is None:
            return []
        return html.xpath('//span[@class="s2"]/a/@href')

    def run(self):
        """Consume novel URLs from the queue until it is empty."""
        from queue import Empty

        while True:
            try:
                # get_nowait() instead of empty() + get(): another worker may
                # take the last item between the two calls, and a blocking
                # get() would then never return.
                novel_url = self.q_novel.get_nowait()
            except Empty:
                break
            print('======={}==========@{}'.format(novel_url, self.name))
            try:
                self.parse_novel(novel_url)
            except Exception as exc:
                # Worker-loop boundary: one bad novel must not kill the thread.
                print('failed to crawl {}: {!r}'.format(novel_url, exc))
if __name__ == '__main__':
    # Ask a throwaway crawler instance for the novel URLs on the index page.
    index_url = 'http://www.xbiquge.la/xuanhuanxiaoshuo/'
    novel_links = Biquge(url=index_url).get_novels()

    # Fill the shared task queue with every novel URL.
    pending = Queue()
    for link in novel_links:
        pending.put(link)

    # Start five worker threads, named '1' through '5', sharing that queue.
    for worker_name in ('1', '2', '3', '4', '5'):
        Biquge(name=worker_name, q_novels=pending).start()



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

上一篇 Python爬取文章和小说内容

原文链接：https://blog.csdn.net/D_wart/article/details/103695881

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/我家自动化/article/detail/728862