https://www.cnblogs.com/microman/p/6111711.html
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- # Created on 2017-12-07 13:40:43
- # Project: adquan
-
- from pyspider.libs.base_handler import *
-
-
class Handler(BaseHandler):
    """pyspider handler that downloads the images embedded in an adquan page.

    Flow: ``on_start`` schedules one fixed article URL, ``detail_page``
    schedules one download task per image found in the article body, and
    ``save_img`` writes each downloaded image to disk through ``Deal``.
    """

    crawl_config = {
    }

    def __init__(self):
        # Filesystem helper used by detail_page / save_img. ``Deal`` is looked
        # up when the handler is instantiated, not when this class is defined.
        self.deal = Deal()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: schedule the single article page to be crawled."""
        self.crawl('http://creative.adquan.com/show/42759', callback=self.detail_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule every absolute ``http`` link on the page as a detail page.

        Not reachable from ``on_start`` as written; kept so an index URL can
        be wired in as a callback target.
        """
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Schedule a download for every image inside ``.con_Text``.

        Images without a ``src`` are skipped and do not consume an index.
        Each scheduled task carries its target directory and file name in
        ``save`` so that ``save_img`` knows where to write the bytes.

        Returns:
            dict with the page ``url`` and ``title``.
        """
        # NOTE(review): the directory name is fixed and file names restart at
        # 0 on every call, so images from different pages overwrite each other.
        name = 'test'

        sources = (img.attr.src for img in response.doc('.con_Text img').items())
        urls = [src for src in sources if src]

        if urls:
            # The target directory is the same for every image: create it once.
            dir_path = self.deal.mkDir(name)
            for count, url in enumerate(urls):
                extension = self.deal.getExtension(url)
                # Avoid a dangling '.' when no extension could be determined.
                file_name = str(count) + '.' + extension if extension else str(count)
                self.crawl(url, callback=self.save_img,
                           save={'dir_path': dir_path, 'file_name': file_name})

        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    def save_img(self, response):
        """Write a downloaded image to the location chosen in ``detail_page``."""
        target = response.save
        file_path = target['dir_path'] + '/' + target['file_name']
        self.deal.saveImg(response.content, file_path)
-
-
- import os
-
# Default root directory under which everything is written.
DIR_PATH = "E:/pyspider/"


class Deal:
    """Filesystem helper: creates directories and writes downloaded content."""

    def __init__(self, path=None):
        """Prepare the root output directory.

        Args:
            path: root directory to write into; defaults to ``DIR_PATH``.
                A trailing ``/`` is appended when missing.
        """
        self.path = DIR_PATH if path is None else path
        if not self.path.endswith('/'):
            self.path = self.path + '/'
        self._ensure_dir(self.path)

    @staticmethod
    def _ensure_dir(dir_path):
        """Create ``dir_path`` (and parents) unless it already is a directory."""
        # Try-then-check instead of check-then-create: another worker may
        # create the directory between the two steps.
        try:
            os.makedirs(dir_path)
        except OSError:
            if not os.path.isdir(dir_path):
                raise

    def mkDir(self, path):
        """Ensure a sub-directory of the root exists and return its path.

        Args:
            path: sub-directory name; surrounding whitespace is stripped.

        Returns:
            The root path concatenated with the stripped name.
        """
        dir_path = self.path + path.strip()
        self._ensure_dir(dir_path)
        return dir_path

    def saveImg(self, content, path):
        """Write the bytes ``content`` to the file at ``path``."""
        # ``with`` closes the handle even if the write fails.
        with open(path, 'wb') as f:
            f.write(content)

    def saveBrief(self, content, dir_path, name):
        """Write ``content`` as UTF-8 to ``<dir_path>/<name>.txt``.

        Args:
            content: text to store; a byte string is written unchanged.
            dir_path: directory that receives the file.
            name: file name without the ``.txt`` suffix.
        """
        file_name = dir_path + "/" + name + ".txt"
        data = content if isinstance(content, bytes) else content.encode('utf-8')
        # Binary mode: the payload is already encoded, and the handle is
        # closed deterministically.
        with open(file_name, 'wb') as f:
            f.write(data)

    def getExtension(self, url):
        """Return the file extension of ``url`` without the leading dot.

        Only the path component is inspected, so query strings and fragments
        are ignored.

        Returns:
            The extension (e.g. ``'jpg'``), or ``''`` when ``url`` is empty or
            its path has no extension.
        """
        if not url:
            return ''
        try:
            from urllib.parse import urlsplit  # Python 3
        except ImportError:
            from urlparse import urlsplit  # Python 2
        return os.path.splitext(urlsplit(url).path)[1][1:]
http://demo.pyspider.org/