Runtime environment: Python 3.6
Target site: book.zongheng.com
Demo scraper code. Two notes before the code:
- Proxy IPs: I wrote a proxy-IP extraction API (link).
- The account's Cookie information is added to the headers; this is required for content that only VIP accounts can access.
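Neither the proxy-extraction API nor the Cookie value appears in the post, but wiring both into a requests session is straightforward. A minimal sketch, with placeholder proxy address and cookie string:

import requests

session = requests.session()

# Placeholder proxy, e.g. one address returned by a proxy-IP extraction API
proxy = 'http://127.0.0.1:8888'
session.proxies.update({'http': proxy, 'https': proxy})

# Placeholder Cookie copied from a logged-in browser session; without it,
# VIP-only chapters will not be returned
session.headers.update({'Cookie': 'key1=value1; key2=value2'})

resp = session.get('http://book.zongheng.com/')
print(resp.status_code)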
# -*- coding: utf-8 -*-
# @Author : Leo
import re
import os
import logging
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

logging.basicConfig(level=logging.INFO,  # minimum level that gets emitted
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')


class ZonghengSpider:
    """Spider for the Zongheng novel site - http://book.zongheng.com/"""

    # Root directory that novels are saved under
    novel_save_dir = 'novels'
    session = requests.session()
    # Retry failed requests up to 3 times
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))

    def __init__(self):
        self.session.headers.update(
            {'Host': 'book.zongheng.com',
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'})
        self.chapter_url = 'http://book.zongheng.com/api/chapter/chapterinfo?bookId={book_id}&chapterId={chapter_id}'

    def crawl(self, target_url: str):
        """Start crawling the given URL.

        :param target_url: URL of the book's detail page
        """

        def request_url(url):
            resp = self.session.get(url=url)
            if resp.status_code == 200:
                return resp.json()
            return None

        book_name, book_id, chapter_id = self.get_page_info(target_url)
        logging.info(f'Book name: {book_name}, book ID: {book_id}, first chapter ID: {chapter_id}')
        if all([book_name, book_id, chapter_id]):
            # Build the directory this book is saved into
            novel_save_path = os.path.join(self.novel_save_dir, book_name)
            if not os.path.exists(novel_save_path):
                os.makedirs(novel_save_path)
            logging.info(f'Book save path: {novel_save_path}')
            index = 0
            while True:
                index += 1
                chapter_url = self._get_chapter_url(book_id, chapter_id)
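The snippet above breaks off inside the while-loop, and the get_page_info and _get_chapter_url methods it calls are never shown. The sketch below is a hypothetical reconstruction of those missing pieces, not the author's original code: the <title> parsing, the /chapter/<bookId>/<chapterId>.html link pattern, the JSON keys ('data', 'chapterName', 'content', 'nextChapterId'), and the example book URL are all assumptions about the site and its chapter API.

# Hypothetical reconstruction of the missing pieces; reuses the imports above.
class ZonghengSpiderDemo(ZonghengSpider):

    def get_page_info(self, page_url):
        """Return (book_name, book_id, first_chapter_id) scraped from a book page."""
        resp = self.session.get(page_url)
        if resp.status_code != 200:
            return None, None, None
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Assumption: the <title> starts with the book name, e.g. "BookName_..."
        title_tag = soup.title
        book_name = title_tag.string.split('_')[0].strip() if title_tag and title_tag.string else None
        # Assumption: the "start reading" link looks like /chapter/<bookId>/<chapterId>.html
        match = re.search(r'/chapter/(\d+)/(\d+)\.html', resp.text)
        if match:
            return book_name, match.group(1), match.group(2)
        return book_name, None, None

    def _get_chapter_url(self, book_id, chapter_id):
        # Fill in the chapter-API template defined in __init__
        return self.chapter_url.format(book_id=book_id, chapter_id=chapter_id)

    def save_chapters(self, book_id, first_chapter_id, novel_save_path):
        """One possible body for the truncated while-loop: follow the chapter
        chain through the JSON API and write each chapter to its own file."""
        chapter_id, index = first_chapter_id, 0
        while chapter_id:
            index += 1
            resp = self.session.get(self._get_chapter_url(book_id, chapter_id))
            if resp.status_code != 200:
                break
            chapter = resp.json().get('data', {})            # assumed envelope key
            title = chapter.get('chapterName') or f'chapter_{index}'
            safe_title = re.sub(r'[\\/:*?"<>|]', '', title)  # drop characters illegal in filenames
            # Assumption: the chapter body comes back as HTML; strip the tags crudely
            text = re.sub(r'<[^>]+>', '\n', chapter.get('content', ''))
            with open(os.path.join(novel_save_path, f'{index:04d}_{safe_title}.txt'),
                      'w', encoding='utf-8') as f:
                f.write(text)
            logging.info(f'Saved chapter {index}: {title}')
            chapter_id = chapter.get('nextChapterId')        # assumed "next chapter" field


if __name__ == '__main__':
    spider = ZonghengSpiderDemo()
    # Placeholder URL; substitute a real book detail page
    name, book_id, chapter_id = spider.get_page_info('http://book.zongheng.com/book/123456.html')
    if all([name, book_id, chapter_id]):
        save_path = os.path.join(spider.novel_save_dir, name)
        os.makedirs(save_path, exist_ok=True)
        spider.save_chapters(book_id, chapter_id, save_path)

Walking the chapter chain through the JSON endpoint avoids parsing each chapter page's HTML, and the loop ends naturally when the (assumed) next-chapter field comes back empty.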