Development workflow:
# 1. Create the project
scrapy startproject mySpider
# 2. Generate a spider
scrapy genspider itcast itcast.cn
# 3. Extract the data
#    Implement the data collection logic in the spider, based on the site structure
# 4. Save the data
#    Use a pipeline for post-processing and saving the data
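For reference, the startproject command generates a project skeleton roughly like the one below (exact files may differ slightly between Scrapy versions); the spider created by genspider ends up under the spiders/ directory:

mySpider/
    scrapy.cfg            # deployment configuration
    mySpider/
        __init__.py
        items.py          # item (data model) definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines (saving the data)
        settings.py       # project settings
        spiders/
            __init__.py
            itcast.py     # generated by: scrapy genspider itcast itcast.cn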
First version of the spider, used to check that the teacher nodes can be selected:

import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    # 2. Check and, if needed, adjust the allowed domains
    allowed_domains = ['itcast.cn']
    # 1. Set the start URL
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ajavaee']

    # 3. Implement the crawling logic in the parse method
    def parse(self, response):
        node_list = response.xpath('//div[@class="li_txt"]')
        print(len(node_list))

This is then rewritten to extract and yield each teacher's data:

import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    # 2. Check and, if needed, adjust the allowed domains
    allowed_domains = ['itcast.cn']
    # 1. Set the start URL
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ajavaee']

    # 3. Implement the crawling logic in the parse method
    def parse(self, response):
        # Get all teacher nodes
        node_list = response.xpath('//div[@class="li_txt"]')
        # Iterate over the list of teacher nodes
        for node in node_list:
            temp = {}
            # In Scrapy, the xpath method returns a list of Selector objects.
            # extract() pulls the data out of a Selector, used together with an index;
            # extract_first() returns the first result directly, so no index is needed.
            # When you are sure there is only one value, use .extract_first()
            temp['name'] = node.xpath('./h3/text()').extract_first()
            temp['title'] = node.xpath('./h4/text()')[0].extract()
            temp['desc'] = node.xpath('./p/text()')[0].extract()
            # return would end the method after the first item, and collecting
            # everything into a list with .append would still leave pagination
            # to handle, so each item is yielded instead
            yield temp
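To run the crawl, the standard Scrapy command is used from the project root (the directory that contains scrapy.cfg); saving the results is left to the pipeline below:

# 5. Run the spider
scrapy crawl itcast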
Explanation: the data is saved in the pipeline (pipelines.py):
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json


class MyspiderPipeline(object):
    def __init__(self):
        self.file = open('itcast.json', 'w')

    def process_item(self, item, spider):
        # print('itcast', item)
        # Serialize the dict data to JSON
        json_data = json.dumps(item, ensure_ascii=False) + ',\n'
        # Write the data to the file
        self.file.write(json_data)
        # After the pipeline has handled an item, it must return it to the engine
        return item

    def __del__(self):
        self.file.close()
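As the header comment says, the pipeline only runs once it is registered in ITEM_PIPELINES; a minimal settings.py excerpt, assuming the project was created as mySpider above:

# settings.py (excerpt)
ITEM_PIPELINES = {
    # the number (0-1000) sets the order when several pipelines are enabled
    'mySpider.pipelines.MyspiderPipeline': 300,
}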
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()
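With the item fields defined, the spider can yield MyspiderItem objects instead of plain dicts; a minimal sketch of that variant (the import path assumes the project name mySpider). Note that json.dumps cannot serialize an Item directly, so the pipeline would then need dict(item) or ItemAdapter(item).asdict() before dumping:

# mySpider/spiders/itcast.py -- variant that yields MyspiderItem instead of a dict
import scrapy

from mySpider.items import MyspiderItem


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ajavaee']

    def parse(self, response):
        for node in response.xpath('//div[@class="li_txt"]'):
            item = MyspiderItem()
            item['name'] = node.xpath('./h3/text()').extract_first()
            item['title'] = node.xpath('./h4/text()').extract_first()
            item['desc'] = node.xpath('./p/text()').extract_first()
            yield item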