First, install the required libraries (a quick import check is shown right after this list):
1. pip install requests
2. pip install lxml
3. pip install bs4
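To confirm the installation worked, a short sanity check like the one below can be run first. This is only an assumption-free import check; note that the crawler script further down actually uses only requests and lxml, while bs4 is installed here but never imported.

# Sanity check: verify the freshly installed packages can be imported.
import requests
import bs4
from lxml import etree

print(requests.__version__)
print(bs4.__version__)
print(etree.LXML_VERSION)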
Once the installation is complete, you can run the following code:
import requests
from lxml import etree
import os
from queue import Queue, Empty
import threading


class Biquge(threading.Thread):
    def __init__(self, url=None, name=None, q_novels=None):
        super().__init__()
        self.url = url
        self.name = name
        self.q_novel = q_novels
        self.proxies = self.get_proxies()

    def get_proxies(self):
        # Fetch a proxy from the local proxy pool; fall back to no proxy on failure.
        try:
            response = requests.get('http://localhost:5000/get')
            proxy = response.text
            proxies = {
                'http': 'http://' + proxy
            }
            return proxies
        except Exception:
            return None

    def get_xpath_by_requests(self, url, proxies):
        '''
        Request a page and return its parsed HTML tree.
        :param url: page URL
        :param proxies: proxy dict for requests
        :return: lxml HTML element
        '''
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
                'Cookie': '_abcde_qweasd=0; _abcde_qweasd=0; bdshare_firstime=1577178973028; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1577178973,1577186563,1577186739,1577235413; BAIDU_SSP_lcr=https://www.baidu.com/link?url=AvLJGcMiHKBXi90P2T0xOluezhPz2PeeTLAbP75dmma&wd=&eqid=e131d391001338d8000000025e02b3d2; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1577235422',
                'Referer': 'http://www.xbiquge.la/'
            }
            response = requests.get(url, headers=headers, proxies=proxies)
            return etree.HTML(response.content.decode('utf-8'))
        except Exception:
            # On failure, switch to a new proxy and retry.
            new_proxies = self.get_proxies()
            print('Switching to proxy {}!'.format(new_proxies))
            return self.get_xpath_by_requests(url, new_proxies)

    def get_text(self, text):
        # Return the first item of an XPath result list, or an empty string.
        if text:
            return text[0]
        return ''

    def write_to_txt(self, text, book_name):
        filename = './book/' + book_name + '.txt'  # output file name and format
        dirname = os.path.dirname(filename)
        os.makedirs(dirname, exist_ok=True)  # safe even if several threads race here
        with open(filename, 'a+', encoding='utf-8') as fp:  # append the chapter
            fp.write(text)

    def parse_chapter(self, url):
        # Parse a single chapter page and append its text to the book file.
        url = 'http://www.xbiquge.la' + url
        html = self.get_xpath_by_requests(url, self.proxies)
        chapter_name = self.get_text(html.xpath('//div[@class="bookname"]/h1/text()'))
        book_name = self.get_text(html.xpath('//div[@class="con_top"]/a[last()]/text()'))
        contents = html.xpath('//div[@id="content"]/text()')
        content = ''.join(contents)
        text = chapter_name + '\n' + content
        self.write_to_txt(text, book_name)

    def parse_novel(self, url):
        # Fetch the novel's chapter list and crawl every chapter.
        html = self.get_xpath_by_requests(url, self.proxies)
        chapters = html.xpath('//div[@id="list"]/dl/dd/a/@href')
        for chapter in chapters:
            self.parse_chapter(chapter)

    def get_novels(self):
        # Collect the novel URLs from the category page.
        html = self.get_xpath_by_requests(self.url, self.proxies)
        novel_urls = html.xpath('//span[@class="s2"]/a/@href')
        return novel_urls

    def run(self):
        # Worker loop: take novel URLs from the queue until it is empty.
        while True:
            try:
                novel_url = self.q_novel.get_nowait()
            except Empty:
                break
            print('======={}==========@{}'.format(novel_url, self.name))
            self.parse_novel(novel_url)


if __name__ == '__main__':
    base_url = 'http://www.xbiquge.la/xuanhuanxiaoshuo/'
    b = Biquge(url=base_url)
    novel_urls = b.get_novels()
    # Initialize the task queue.
    q_novels = Queue()
    for url in novel_urls:
        q_novels.put(url)
    # Create a list of worker names and start one thread per name.
    crawl_list = ['1', '2', '3', '4', '5']
    for crawl in crawl_list:
        t = Biquge(name=crawl, q_novels=q_novels)
        t.start()
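Note that get_proxies() expects a local proxy pool service at http://localhost:5000/get that returns a bare "ip:port" string; the original post does not include that service, and if it is not running the method simply returns None and requests is made without a proxy. Purely as an illustration of what such an endpoint could look like, here is a minimal Flask sketch; the PROXY_POOL entries are hypothetical placeholders, not working proxies.

# Minimal sketch of the proxy pool endpoint the crawler assumes (hypothetical).
import random
from flask import Flask

app = Flask(__name__)

# Placeholder pool; in practice this would be filled from a real proxy source.
PROXY_POOL = ['127.0.0.1:8888', '127.0.0.1:8889']

@app.route('/get')
def get_proxy():
    # Return a plain "ip:port" string, which the crawler prefixes with "http://".
    return random.choice(PROXY_POOL)

if __name__ == '__main__':
    app.run(port=5000)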
Original article: https://blog.csdn.net/D_wart/article/details/103695881