A system I/O device driver usually calls its own subsystem's interfaces to allocate memory for DMA, but the call chain eventually reaches the DMA subsystem's dma_alloc_coherent()/dma_alloc_attrs() interfaces. dma_alloc_coherent()/dma_alloc_attrs() allocate the memory through the DMA IOMMU callbacks and prepare the translation tables for DMA memory accesses that go through the IOMMU. After that, everything required for IOMMU-translated DMA accesses to the memory is in place.
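As a hedged sketch of what this looks like from the driver side (the device pointer, buffer size, and function names here are illustrative, not from any particular driver):

/* Minimal sketch: allocate and free a coherent DMA buffer. */
static void *example_buf;      /* CPU virtual address */
static dma_addr_t example_dma; /* address to program into the device */

static int example_alloc(struct device *dev)
{
	example_buf = dma_alloc_coherent(dev, SZ_64K, &example_dma, GFP_KERNEL);
	if (!example_buf)
		return -ENOMEM;
	/* hand example_dma to the device's DMA engine here */
	return 0;
}

static void example_free(struct device *dev)
{
	dma_free_coherent(dev, SZ_64K, example_buf, example_dma);
}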
Drivers of the ALSA audio subsystem allocate memory for DMA through functions such as snd_pcm_lib_preallocate_pages()/snd_pcm_lib_preallocate_pages_for_all()/snd_pcm_lib_malloc_pages(), which are defined in sound/core/pcm_memory.c as follows:
static int do_alloc_pages(struct snd_card *card, int type, struct device *dev,
			  size_t size, struct snd_dma_buffer *dmab)
{
	int err;

	if (max_alloc_per_card &&
	    card->total_pcm_alloc_bytes + size > max_alloc_per_card)
		return -ENOMEM;

	err = snd_dma_alloc_pages(type, dev, size, dmab);
	if (!err) {
		mutex_lock(&card->memory_mutex);
		card->total_pcm_alloc_bytes += dmab->bytes;
		mutex_unlock(&card->memory_mutex);
	}
	return err;
}

. . . . . .

/*
 * try to allocate as the large pages as possible.
 * stores the resultant memory size in *res_size.
 *
 * the minimum size is snd_minimum_buffer.  it should be power of 2.
 */
static int preallocate_pcm_pages(struct snd_pcm_substream *substream, size_t size)
{
	struct snd_dma_buffer *dmab = &substream->dma_buffer;
	struct snd_card *card = substream->pcm->card;
	size_t orig_size = size;
	int err;

	do {
		err = do_alloc_pages(card, dmab->dev.type, dmab->dev.dev,
				     size, dmab);
		if (err != -ENOMEM)
			return err;
		size >>= 1;
	} while (size >= snd_minimum_buffer);
	dmab->bytes = 0; /* tell error */
	pr_warn("ALSA pcmC%dD%d%c,%d:%s: cannot preallocate for size %zu\n",
		substream->pcm->card->number, substream->pcm->device,
		substream->stream ? 'c' : 'p', substream->number,
		substream->pcm->name, orig_size);
	return 0;
}

. . . . . .

/*
 * pre-allocate the buffer and create a proc file for the substream
 */
static void preallocate_pages(struct snd_pcm_substream *substream,
			      int type, struct device *data,
			      size_t size, size_t max, bool managed)
{
	if (snd_BUG_ON(substream->dma_buffer.dev.type))
		return;

	substream->dma_buffer.dev.type = type;
	substream->dma_buffer.dev.dev = data;

	if (size > 0 && preallocate_dma && substream->number < maximum_substreams)
		preallocate_pcm_pages(substream, size);

	if (substream->dma_buffer.bytes > 0)
		substream->buffer_bytes_max = substream->dma_buffer.bytes;
	substream->dma_max = max;
	if (max > 0)
		preallocate_info_init(substream);
	if (managed)
		substream->managed_buffer_alloc = 1;
}

static void preallocate_pages_for_all(struct snd_pcm *pcm, int type,
				      void *data, size_t size, size_t max,
				      bool managed)
{
	struct snd_pcm_substream *substream;
	int stream;

	for (stream = 0; stream < 2; stream++)
		for (substream = pcm->streams[stream].substream; substream;
		     substream = substream->next)
			preallocate_pages(substream, type, data, size, max,
					  managed);
}

/**
 * snd_pcm_lib_preallocate_pages - pre-allocation for the given DMA type
 * @substream: the pcm substream instance
 * @type: DMA type (SNDRV_DMA_TYPE_*)
 * @data: DMA type dependent data
 * @size: the requested pre-allocation size in bytes
 * @max: the max. allowed pre-allocation size
 *
 * Do pre-allocation for the given DMA buffer type.
 */
void snd_pcm_lib_preallocate_pages(struct snd_pcm_substream *substream,
				   int type, struct device *data,
				   size_t size, size_t max)
{
	preallocate_pages(substream, type, data, size, max, false);
}
EXPORT_SYMBOL(snd_pcm_lib_preallocate_pages);

/**
 * snd_pcm_lib_preallocate_pages_for_all - pre-allocation for continuous memory type (all substreams)
 * @pcm: the pcm instance
 * @type: DMA type (SNDRV_DMA_TYPE_*)
 * @data: DMA type dependent data
 * @size: the requested pre-allocation size in bytes
 * @max: the max. allowed pre-allocation size
 *
 * Do pre-allocation to all substreams of the given pcm for the
 * specified DMA type.
 */
void snd_pcm_lib_preallocate_pages_for_all(struct snd_pcm *pcm,
					   int type, void *data,
					   size_t size, size_t max)
{
	preallocate_pages_for_all(pcm, type, data, size, max, false);
}
EXPORT_SYMBOL(snd_pcm_lib_preallocate_pages_for_all);

/**
 * snd_pcm_set_managed_buffer - set up buffer management for a substream
 * @substream: the pcm substream instance
 * @type: DMA type (SNDRV_DMA_TYPE_*)
 * @data: DMA type dependent data
 * @size: the requested pre-allocation size in bytes
 * @max: the max. allowed pre-allocation size
 *
 * Do pre-allocation for the given DMA buffer type, and set the managed
 * buffer allocation mode to the given substream.
 * In this mode, PCM core will allocate a buffer automatically before PCM
 * hw_params ops call, and release the buffer after PCM hw_free ops call
 * as well, so that the driver doesn't need to invoke the allocation and
 * the release explicitly in its callback.
 * When a buffer is actually allocated before the PCM hw_params call, it
 * turns on the runtime buffer_changed flag for drivers changing their h/w
 * parameters accordingly.
 */
void snd_pcm_set_managed_buffer(struct snd_pcm_substream *substream, int type,
				struct device *data, size_t size, size_t max)
{
	preallocate_pages(substream, type, data, size, max, true);
}
EXPORT_SYMBOL(snd_pcm_set_managed_buffer);

/**
 * snd_pcm_set_managed_buffer_all - set up buffer management for all substreams
 *	for all substreams
 * @pcm: the pcm instance
 * @type: DMA type (SNDRV_DMA_TYPE_*)
 * @data: DMA type dependent data
 * @size: the requested pre-allocation size in bytes
 * @max: the max. allowed pre-allocation size
 *
 * Do pre-allocation to all substreams of the given pcm for the specified DMA
 * type and size, and set the managed_buffer_alloc flag to each substream.
 */
void snd_pcm_set_managed_buffer_all(struct snd_pcm *pcm, int type,
				    struct device *data,
				    size_t size, size_t max)
{
	preallocate_pages_for_all(pcm, type, data, size, max, true);
}
EXPORT_SYMBOL(snd_pcm_set_managed_buffer_all);

. . . . . .

/**
 * snd_pcm_lib_malloc_pages - allocate the DMA buffer
 * @substream: the substream to allocate the DMA buffer to
 * @size: the requested buffer size in bytes
 *
 * Allocates the DMA buffer on the BUS type given earlier to
 * snd_pcm_lib_preallocate_xxx_pages().
 *
 * Return: 1 if the buffer is changed, 0 if not changed, or a negative
 * code on failure.
 */
int snd_pcm_lib_malloc_pages(struct snd_pcm_substream *substream, size_t size)
{
	struct snd_card *card;
	struct snd_pcm_runtime *runtime;
	struct snd_dma_buffer *dmab = NULL;

	if (PCM_RUNTIME_CHECK(substream))
		return -EINVAL;
	if (snd_BUG_ON(substream->dma_buffer.dev.type ==
		       SNDRV_DMA_TYPE_UNKNOWN))
		return -EINVAL;
	runtime = substream->runtime;
	card = substream->pcm->card;

	if (runtime->dma_buffer_p) {
		/* perphaps, we might free the large DMA memory region
		   to save some space here, but the actual solution
		   costs us less time */
		if (runtime->dma_buffer_p->bytes >= size) {
			runtime->dma_bytes = size;
			return 0;	/* ok, do not change */
		}
		snd_pcm_lib_free_pages(substream);
	}
	if (substream->dma_buffer.area != NULL &&
	    substream->dma_buffer.bytes >= size) {
		dmab = &substream->dma_buffer; /* use the pre-allocated buffer */
	} else {
		dmab = kzalloc(sizeof(*dmab), GFP_KERNEL);
		if (!dmab)
			return -ENOMEM;
		dmab->dev = substream->dma_buffer.dev;
		if (do_alloc_pages(card,
				   substream->dma_buffer.dev.type,
				   substream->dma_buffer.dev.dev,
				   size, dmab) < 0) {
			kfree(dmab);
			return -ENOMEM;
		}
	}
	snd_pcm_set_runtime_buffer(substream, dmab);
	runtime->dma_bytes = size;
	return 1;			/* area was changed */
}
EXPORT_SYMBOL(snd_pcm_lib_malloc_pages);
The calling relationships among these DMA memory allocation functions of the Linux kernel audio subsystem are as follows.
snd_pcm_lib_preallocate_pages() and snd_pcm_lib_preallocate_pages_for_all() preallocate DMA memory; the difference between them is that the former preallocates for a single substream while the latter does so for all substreams of a PCM. Both preallocate memory for a substream through preallocate_pages()/preallocate_pcm_pages()/do_alloc_pages(). do_alloc_pages() enforces the limit on the total memory allocated for all substreams of a single sound card, 32 MB by default. preallocate_pcm_pages() implements the recovery strategy for allocation failures: when allocating a block of the requested maximum size fails, the request size is halved and the allocation retried, until it succeeds or the request size drops below 16 KB, at which point the preallocation gives up. preallocate_pages() updates the substream's maximum buffer size to the number of bytes actually preallocated. The preallocated block is recorded in the dma_buffer field of the substream's snd_pcm_substream.
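As a hedged usage sketch (the chip pointer and sizes are hypothetical), a PCM driver typically wires up the pre-allocation once at probe time:

/* Preallocate 64 KB per substream, allow growing up to 128 KB. */
snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
				      chip->dev, 64 * 1024, 128 * 1024);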
An ALSA driver calls snd_pcm_lib_malloc_pages() to allocate the DMA buffer for a substream, roughly as follows: if the runtime already holds a buffer of sufficient size, it is kept; otherwise the preallocated buffer is used when it is large enough, or do_alloc_pages() allocates a fresh block for the substream. do_alloc_pages() calls snd_dma_alloc_pages(), which allocates the memory according to the requested type and is defined in sound/core/memalloc.c as follows:
#ifdef CONFIG_HAS_DMA
/* allocate the coherent DMA pages */
static void snd_malloc_dev_pages(struct snd_dma_buffer *dmab, size_t size)
{
	gfp_t gfp_flags;

	gfp_flags = GFP_KERNEL
		| __GFP_COMP	/* compound page lets parts be mapped */
		| __GFP_NORETRY /* don't trigger OOM-killer */
		| __GFP_NOWARN; /* no stack trace print - this call is non-critical */
	dmab->area = dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr,
					gfp_flags);
#ifdef CONFIG_X86
	if (dmab->area && dmab->dev.type == SNDRV_DMA_TYPE_DEV_UC)
		set_memory_wc((unsigned long)dmab->area,
			      PAGE_ALIGN(size) >> PAGE_SHIFT);
#endif
}

/* free the coherent DMA pages */
static void snd_free_dev_pages(struct snd_dma_buffer *dmab)
{
#ifdef CONFIG_X86
	if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_UC)
		set_memory_wb((unsigned long)dmab->area,
			      PAGE_ALIGN(dmab->bytes) >> PAGE_SHIFT);
#endif
	dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr);
}

. . . . . .

/**
 * snd_dma_alloc_pages - allocate the buffer area according to the given type
 * @type: the DMA buffer type
 * @device: the device pointer
 * @size: the buffer size to allocate
 * @dmab: buffer allocation record to store the allocated data
 *
 * Calls the memory-allocator function for the corresponding
 * buffer type.
 *
 * Return: Zero if the buffer with the given size is allocated successfully,
 * otherwise a negative value on error.
 */
int snd_dma_alloc_pages(int type, struct device *device, size_t size,
			struct snd_dma_buffer *dmab)
{
	gfp_t gfp;

	if (WARN_ON(!size))
		return -ENXIO;
	if (WARN_ON(!dmab))
		return -ENXIO;

	dmab->dev.type = type;
	dmab->dev.dev = device;
	dmab->bytes = 0;
	dmab->area = NULL;
	dmab->addr = 0;
	dmab->private_data = NULL;
	switch (type) {
	case SNDRV_DMA_TYPE_CONTINUOUS:
		gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL);
		dmab->area = alloc_pages_exact(size, gfp);
		break;
	case SNDRV_DMA_TYPE_VMALLOC:
		gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM);
		dmab->area = __vmalloc(size, gfp);
		break;
#ifdef CONFIG_HAS_DMA
#ifdef CONFIG_GENERIC_ALLOCATOR
	case SNDRV_DMA_TYPE_DEV_IRAM:
		snd_malloc_dev_iram(dmab, size);
		if (dmab->area)
			break;
		/* Internal memory might have limited size and no enough space,
		 * so if we fail to malloc, try to fetch memory traditionally.
		 */
		dmab->dev.type = SNDRV_DMA_TYPE_DEV;
		fallthrough;
#endif /* CONFIG_GENERIC_ALLOCATOR */
	case SNDRV_DMA_TYPE_DEV:
	case SNDRV_DMA_TYPE_DEV_UC:
		snd_malloc_dev_pages(dmab, size);
		break;
#endif
#ifdef CONFIG_SND_DMA_SGBUF
	case SNDRV_DMA_TYPE_DEV_SG:
	case SNDRV_DMA_TYPE_DEV_UC_SG:
		snd_malloc_sgbuf_pages(device, size, dmab, NULL);
		break;
#endif
	default:
		pr_err("snd-malloc: invalid device type %d\n", type);
		return -ENXIO;
	}
	if (!dmab->area)
		return -ENOMEM;
	dmab->bytes = size;
	return 0;
}
EXPORT_SYMBOL(snd_dma_alloc_pages);

. . . . . .

/**
 * snd_dma_free_pages - release the allocated buffer
 * @dmab: the buffer allocation record to release
 *
 * Releases the allocated buffer via snd_dma_alloc_pages().
 */
void snd_dma_free_pages(struct snd_dma_buffer *dmab)
{
	switch (dmab->dev.type) {
	case SNDRV_DMA_TYPE_CONTINUOUS:
		free_pages_exact(dmab->area, dmab->bytes);
		break;
	case SNDRV_DMA_TYPE_VMALLOC:
		vfree(dmab->area);
		break;
#ifdef CONFIG_HAS_DMA
#ifdef CONFIG_GENERIC_ALLOCATOR
	case SNDRV_DMA_TYPE_DEV_IRAM:
		snd_free_dev_iram(dmab);
		break;
#endif /* CONFIG_GENERIC_ALLOCATOR */
	case SNDRV_DMA_TYPE_DEV:
	case SNDRV_DMA_TYPE_DEV_UC:
		snd_free_dev_pages(dmab);
		break;
#endif
#ifdef CONFIG_SND_DMA_SGBUF
	case SNDRV_DMA_TYPE_DEV_SG:
	case SNDRV_DMA_TYPE_DEV_UC_SG:
		snd_free_sgbuf_pages(dmab);
		break;
#endif
	default:
		pr_err("snd-malloc: invalid device type %d\n", dmab->dev.type);
	}
}
EXPORT_SYMBOL(snd_dma_free_pages);
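Stepping back to the driver-facing side, a hedged sketch of the hw_params/hw_free callbacks that usually drive these helpers (the callback names are illustrative):

static int example_hw_params(struct snd_pcm_substream *substream,
			     struct snd_pcm_hw_params *params)
{
	/* returns 1 if the buffer changed, 0 if unchanged, <0 on error */
	return snd_pcm_lib_malloc_pages(substream,
					params_buffer_bytes(params));
}

static int example_hw_free(struct snd_pcm_substream *substream)
{
	return snd_pcm_lib_free_pages(substream);
}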
The DMA memory is ultimately allocated and freed by the DMA subsystem's dma_alloc_coherent() and dma_free_coherent(). dma_alloc_coherent() is defined in include/linux/dma-mapping.h as follows:
static inline void *dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
return dma_alloc_attrs(dev, size, dma_handle, gfp,
(gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0);
}
static inline void dma_free_coherent(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_handle)
{
return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0);
}
dma_alloc_coherent() is a thin wrapper around dma_alloc_attrs(), which is defined in kernel/dma/mapping.c as follows:
static bool dma_go_direct(struct device *dev, dma_addr_t mask,
			  const struct dma_map_ops *ops)
{
	if (likely(!ops))
		return true;
#ifdef CONFIG_DMA_OPS_BYPASS
	if (dev->dma_ops_bypass)
		return min_not_zero(mask, dev->bus_dma_limit) >=
			    dma_direct_get_required_mask(dev);
#endif
	return false;
}

/*
 * Check if the devices uses a direct mapping for streaming DMA operations.
 * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
 * enough.
 */
static inline bool dma_alloc_direct(struct device *dev,
				    const struct dma_map_ops *ops)
{
	return dma_go_direct(dev, dev->coherent_dma_mask, ops);
}

. . . . . .

void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
		gfp_t flag, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	void *cpu_addr;

	WARN_ON_ONCE(!dev->coherent_dma_mask);

	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
		return cpu_addr;

	/* let the implementation decide on the zone to allocate from: */
	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);

	if (dma_alloc_direct(dev, ops))
		cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
	else if (ops->alloc)
		cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
	else
		return NULL;

	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
	return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_attrs);
dma_alloc_attrs() executes as follows:
- dma_alloc_from_dev_coherent() first tries to allocate from the system I/O device's coherent memory; if that succeeds the result is returned, otherwise execution continues.
- If the device uses a direct mapping (dma_alloc_direct()), dma_direct_alloc() allocates the memory; otherwise the memory is allocated through the alloc() callback of the device's DMA mapping operations. Generally, when DMA mapping operations are in effect, the direct mapping is not used.
With the IOMMU enabled, for system I/O devices attached to an IOMMU, the DMA mapping operations are set up during the device's IOMMU probing and its attachment to the IOMMU device, in of_dma_configure_id(), by calling arch_setup_dma_ops(). The device coherent memory pool is a Linux kernel mechanism for DMA memory management and allocation by system I/O device drivers. dma_alloc_from_dev_coherent() is defined in kernel/dma/coherent.c as follows:
static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
{
	if (dev && dev->dma_mem)
		return dev->dma_mem;
	return NULL;
}

static inline dma_addr_t dma_get_device_base(struct device *dev,
					     struct dma_coherent_mem * mem)
{
	if (mem->use_dev_dma_pfn_offset)
		return phys_to_dma(dev, PFN_PHYS(mem->pfn_base));
	return mem->device_base;
}

. . . . . .

static void *__dma_alloc_from_coherent(struct device *dev,
				       struct dma_coherent_mem *mem,
				       ssize_t size, dma_addr_t *dma_handle)
{
	int order = get_order(size);
	unsigned long flags;
	int pageno;
	void *ret;

	spin_lock_irqsave(&mem->spinlock, flags);

	if (unlikely(size > ((dma_addr_t)mem->size << PAGE_SHIFT)))
		goto err;

	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
	if (unlikely(pageno < 0))
		goto err;

	/*
	 * Memory was found in the coherent area.
	 */
	*dma_handle = dma_get_device_base(dev, mem) +
			((dma_addr_t)pageno << PAGE_SHIFT);
	ret = mem->virt_base + ((dma_addr_t)pageno << PAGE_SHIFT);
	spin_unlock_irqrestore(&mem->spinlock, flags);
	memset(ret, 0, size);
	return ret;
err:
	spin_unlock_irqrestore(&mem->spinlock, flags);
	return NULL;
}

/**
 * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool
 * @dev:	device from which we allocate memory
 * @size:	size of requested memory area
 * @dma_handle:	This will be filled with the correct dma handle
 * @ret:	This pointer will be filled with the virtual address
 *		to allocated area.
 *
 * This function should be only called from per-arch dma_alloc_coherent()
 * to support allocation from per-device coherent memory pools.
 *
 * Returns 0 if dma_alloc_coherent should continue with allocating from
 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
 */
int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
		dma_addr_t *dma_handle, void **ret)
{
	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);

	if (!mem)
		return 0;

	*ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
	return 1;
}
dma_alloc_from_dev_coherent() allocates memory from the device's coherent memory pool. The coherent memory pool is created by the device driver through the dma_declare_coherent_memory() interface, which is implemented in kernel/dma/coherent.c as follows:
static int dma_init_coherent_memory(phys_addr_t phys_addr,
		dma_addr_t device_addr, size_t size,
		struct dma_coherent_mem **mem)
{
	struct dma_coherent_mem *dma_mem = NULL;
	void *mem_base = NULL;
	int pages = size >> PAGE_SHIFT;
	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
	int ret;

	if (!size) {
		ret = -EINVAL;
		goto out;
	}

	mem_base = memremap(phys_addr, size, MEMREMAP_WC);
	if (!mem_base) {
		ret = -EINVAL;
		goto out;
	}
	dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
	if (!dma_mem) {
		ret = -ENOMEM;
		goto out;
	}
	dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
	if (!dma_mem->bitmap) {
		ret = -ENOMEM;
		goto out;
	}

	dma_mem->virt_base = mem_base;
	dma_mem->device_base = device_addr;
	dma_mem->pfn_base = PFN_DOWN(phys_addr);
	dma_mem->size = pages;
	spin_lock_init(&dma_mem->spinlock);

	*mem = dma_mem;
	return 0;

out:
	kfree(dma_mem);
	if (mem_base)
		memunmap(mem_base);
	return ret;
}

. . . . . .

static int dma_assign_coherent_memory(struct device *dev,
				      struct dma_coherent_mem *mem)
{
	if (!dev)
		return -ENODEV;

	if (dev->dma_mem)
		return -EBUSY;

	dev->dma_mem = mem;
	return 0;
}

/*
 * Declare a region of memory to be handed out by dma_alloc_coherent() when it
 * is asked for coherent memory for this device.  This shall only be used
 * from platform code, usually based on the device tree description.
 *
 * phys_addr is the CPU physical address to which the memory is currently
 * assigned (this will be ioremapped so the CPU can access the region).
 *
 * device_addr is the DMA address the device needs to be programmed with to
 * actually address this memory (this will be handed out as the dma_addr_t in
 * dma_alloc_coherent()).
 *
 * size is the size of the area (must be a multiple of PAGE_SIZE).
 *
 * As a simplification for the platforms, only *one* such region of memory may
 * be declared per device.
 */
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
				dma_addr_t device_addr, size_t size)
{
	struct dma_coherent_mem *mem;
	int ret;

	ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
	if (ret)
		return ret;

	ret = dma_assign_coherent_memory(dev, mem);
	if (ret)
		dma_release_coherent_memory(mem);
	return ret;
}
System I/O device drivers that do not use a coherent memory pool can ignore the logic of allocating from it.
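For reference, a minimal sketch of declaring such a pool from platform code (the addresses and size are made up):

/* Hypothetical reserved region: 1 MB at CPU physical address 0x80000000,
 * which the device itself addresses at 0x00000000. */
ret = dma_declare_coherent_memory(dev, 0x80000000, 0x00000000, SZ_1M);
if (ret)
	dev_warn(dev, "no coherent pool, using generic DMA allocation\n");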
arch_setup_dma_ops() sets up the DMA operations for a system I/O device. It is provided by each CPU hardware platform; on the ARM64 platform it is implemented in arch/arm64/mm/dma-mapping.c as follows:
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
			const struct iommu_ops *iommu, bool coherent)
{
	int cls = cache_line_size_of_cpu();

	WARN_TAINT(!coherent && cls > ARCH_DMA_MINALIGN,
		   TAINT_CPU_OUT_OF_SPEC,
		   "%s %s: ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
		   dev_driver_string(dev), dev_name(dev),
		   ARCH_DMA_MINALIGN, cls);

	dev->dma_coherent = coherent;
	if (iommu)
		iommu_setup_dma_ops(dev, dma_base, size);

#ifdef CONFIG_XEN
	if (xen_initial_domain())
		dev->dma_ops = &xen_swiotlb_dma_ops;
#endif
}
arch_setup_dma_ops() calls the IOMMU subsystem's iommu_setup_dma_ops() to set the DMA operations for the system I/O device. iommu_setup_dma_ops() is defined in drivers/iommu/dma-iommu.c as follows:
static const struct dma_map_ops iommu_dma_ops = {
	.alloc			= iommu_dma_alloc,
	.free			= iommu_dma_free,
	.alloc_pages		= dma_common_alloc_pages,
	.free_pages		= dma_common_free_pages,
	.alloc_noncoherent	= iommu_dma_alloc_noncoherent,
	.free_noncoherent	= iommu_dma_free_noncoherent,
	.mmap			= iommu_dma_mmap,
	.get_sgtable		= iommu_dma_get_sgtable,
	.map_page		= iommu_dma_map_page,
	.unmap_page		= iommu_dma_unmap_page,
	.map_sg			= iommu_dma_map_sg,
	.unmap_sg		= iommu_dma_unmap_sg,
	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
	.sync_single_for_device	= iommu_dma_sync_single_for_device,
	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
	.map_resource		= iommu_dma_map_resource,
	.unmap_resource		= iommu_dma_unmap_resource,
	.get_merge_boundary	= iommu_dma_get_merge_boundary,
};

/*
 * The IOMMU core code allocates the default DMA domain, which the underlying
 * IOMMU driver needs to support via the dma-iommu layer.
 */
void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size)
{
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

	if (!domain)
		goto out_err;

	/*
	 * The IOMMU core code allocates the default DMA domain, which the
	 * underlying IOMMU driver needs to support via the dma-iommu layer.
	 */
	if (domain->type == IOMMU_DOMAIN_DMA) {
		if (iommu_dma_init_domain(domain, dma_base, size, dev))
			goto out_err;
		dev->dma_ops = &iommu_dma_ops;
	}

	return;
out_err:
	pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
		dev_name(dev));
}
iommu_setup_dma_ops() calls iommu_dma_init_domain() to initialize the DMA mapping domain, and sets the DMA operations for the system I/O device. iommu_dma_init_domain() is defined in drivers/iommu/dma-iommu.c as follows:
static int iova_reserve_iommu_regions(struct device *dev,
		struct iommu_domain *domain)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);
	int ret = 0;

	if (dev_is_pci(dev)) {
		ret = iova_reserve_pci_windows(to_pci_dev(dev), iovad);
		if (ret)
			return ret;
	}

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list) {
		unsigned long lo, hi;

		/* We ARE the software that manages these! */
		if (region->type == IOMMU_RESV_SW_MSI)
			continue;

		lo = iova_pfn(iovad, region->start);
		hi = iova_pfn(iovad, region->start + region->length - 1);
		reserve_iova(iovad, lo, hi);

		if (region->type == IOMMU_RESV_MSI)
			ret = cookie_init_hw_msi_region(cookie, region->start,
					region->start + region->length);
		if (ret)
			break;
	}
	iommu_put_resv_regions(dev, &resv_regions);

	return ret;
}

. . . . . .

/**
 * iommu_dma_init_domain - Initialise a DMA mapping domain
 * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
 * @base: IOVA at which the mappable address space starts
 * @size: Size of IOVA space
 * @dev: Device the domain is being initialised for
 *
 * @base and @size should be exact multiples of IOMMU page granularity to
 * avoid rounding surprises. If necessary, we reserve the page at address 0
 * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
 * any change which could make prior IOVAs invalid will fail.
 */
static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
		u64 size, struct device *dev)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	unsigned long order, base_pfn;
	struct iova_domain *iovad;
	int attr;

	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
		return -EINVAL;

	iovad = &cookie->iovad;

	/* Use the smallest supported page size for IOVA granularity */
	order = __ffs(domain->pgsize_bitmap);
	base_pfn = max_t(unsigned long, 1, base >> order);

	/* Check the domain allows at least some access to the device... */
	if (domain->geometry.force_aperture) {
		if (base > domain->geometry.aperture_end ||
		    base + size <= domain->geometry.aperture_start) {
			pr_warn("specified DMA range outside IOMMU capability\n");
			return -EFAULT;
		}
		/* ...then finally give it a kicking to make sure it fits */
		base_pfn = max_t(unsigned long, base_pfn,
				domain->geometry.aperture_start >> order);
	}

	/* start_pfn is always nonzero for an already-initialised domain */
	if (iovad->start_pfn) {
		if (1UL << order != iovad->granule ||
		    base_pfn != iovad->start_pfn) {
			pr_warn("Incompatible range for DMA domain\n");
			return -EFAULT;
		}

		return 0;
	}

	init_iova_domain(iovad, 1UL << order, base_pfn);

	if (!cookie->fq_domain && !iommu_domain_get_attr(domain,
			DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) {
		if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all,
					  NULL))
			pr_warn("iova flush queue initialization failed\n");
		else
			cookie->fq_domain = domain;
	}

	if (!dev)
		return 0;

	return iova_reserve_iommu_regions(dev, domain);
}
Earlier, when the IOMMU callback arm_smmu_domain_alloc() was invoked to create the domain, if the domain type was IOMMU_DOMAIN_DMA, an iommu_dma_cookie of type IOMMU_DMA_IOVA_COOKIE and its struct iova_domain were created for the domain. struct iova_domain manages the domain's I/O virtual address space. iommu_dma_init_domain() calls init_iova_domain() to initialize the domain's struct iova_domain.
size is the size of the IOVA space and is passed in from of_dma_configure_id(); if dev->coherent_dma_mask is nonzero it is generally dev->coherent_dma_mask, otherwise it is 0x100000000.
__ffs() returns the bit index of the least significant set bit in a number's binary representation. iommu_dma_init_domain() initializes the struct iova_domain with 1UL << order as the granule, i.e. IOVA address ranges are allocated at the granularity of the smallest supported page size.
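A small user-space illustration of the granule computation (the pgsize_bitmap value is only an example; ffsl() from <strings.h> stands in for the kernel's __ffs(), which is 0-based):

#include <stdio.h>
#include <strings.h>

int main(void)
{
	unsigned long pgsize_bitmap = 0x40201000UL; /* 4K | 2M | 1G supported */
	unsigned long order = ffsl(pgsize_bitmap) - 1; /* lowest set bit: 12 */

	printf("order=%lu granule=%lu\n", order, 1UL << order);
	/* prints: order=12 granule=4096 */
	return 0;
}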
init_iova_domain() is defined in drivers/iommu/iova.c as follows:
void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
	unsigned long start_pfn)
{
	/*
	 * IOVA granularity will normally be equal to the smallest
	 * supported IOMMU page size; both *must* be capable of
	 * representing individual CPU pages exactly.
	 */
	BUG_ON((granule > PAGE_SIZE) || !is_power_of_2(granule));

	spin_lock_init(&iovad->iova_rbtree_lock);
	iovad->rbroot = RB_ROOT;
	iovad->cached_node = &iovad->anchor.node;
	iovad->cached32_node = &iovad->anchor.node;
	iovad->granule = granule;
	iovad->start_pfn = start_pfn;
	iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
	iovad->max32_alloc_size = iovad->dma_32bit_pfn;
	iovad->flush_cb = NULL;
	iovad->fq = NULL;
	iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
	rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
	rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
	init_iova_rcaches(iovad);
}
EXPORT_SYMBOL_GPL(init_iova_domain);
The alloc() callback of the DMA mapping operations invoked by dma_alloc_attrs() is in fact the iommu_dma_alloc() function, defined in drivers/iommu/dma-iommu.c as follows:
static void *iommu_dma_alloc(struct device *dev, size_t size,
		dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
{
	bool coherent = dev_is_dma_coherent(dev);
	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
	struct page *page = NULL;
	void *cpu_addr;

	gfp |= __GFP_ZERO;

	if (IS_ENABLED(CONFIG_DMA_REMAP) && gfpflags_allow_blocking(gfp) &&
	    !(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) {
		return iommu_dma_alloc_remap(dev, size, handle, gfp,
				dma_pgprot(dev, PAGE_KERNEL, attrs), attrs);
	}

	if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
	    !gfpflags_allow_blocking(gfp) && !coherent)
		page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr,
					       gfp, NULL);
	else
		cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
	if (!cpu_addr)
		return NULL;

	*handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot,
			dev->coherent_dma_mask);
	if (*handle == DMA_MAPPING_ERROR) {
		__iommu_dma_free(dev, size, cpu_addr);
		return NULL;
	}

	return cpu_addr;
}
In iommu_dma_alloc(), if CONFIG_DMA_REMAP is enabled, the gfp flags allow blocking, and the DMA_ATTR_FORCE_CONTIGUOUS attribute is not set, iommu_dma_alloc_remap() is called to perform the allocation and remapping and its result is returned. Otherwise, physical pages are first allocated through dma_alloc_from_pool() or iommu_dma_alloc_pages(), and then __iommu_dma_map() allocates a range in the I/O virtual address space and performs the remapping.
iommu_dma_alloc_remap() is defined in drivers/iommu/dma-iommu.c as follows:
static int iommu_dma_deferred_attach(struct device *dev,
		struct iommu_domain *domain)
{
	const struct iommu_ops *ops = domain->ops;

	if (!is_kdump_kernel())
		return 0;

	if (unlikely(ops->is_attach_deferred &&
			ops->is_attach_deferred(domain, dev)))
		return iommu_attach_device(domain, dev);

	return 0;
}

. . . . . .

/**
 * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
 * @dev: Device to allocate memory for. Must be a real device
 *	 attached to an iommu_dma_domain
 * @size: Size of buffer in bytes
 * @dma_handle: Out argument for allocated DMA handle
 * @gfp: Allocation flags
 * @prot: pgprot_t to use for the remapped mapping
 * @attrs: DMA attributes for this allocation
 *
 * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
 * but an IOMMU which supports smaller pages might not map the whole thing.
 *
 * Return: Mapped virtual address, or NULL on failure.
 */
static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
		dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
		unsigned long attrs)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	bool coherent = dev_is_dma_coherent(dev);
	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
	unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
	struct page **pages;
	struct sg_table sgt;
	dma_addr_t iova;
	void *vaddr;

	*dma_handle = DMA_MAPPING_ERROR;

	if (unlikely(iommu_dma_deferred_attach(dev, domain)))
		return NULL;

	min_size = alloc_sizes & -alloc_sizes;
	if (min_size < PAGE_SIZE) {
		min_size = PAGE_SIZE;
		alloc_sizes |= PAGE_SIZE;
	} else {
		size = ALIGN(size, min_size);
	}
	if (attrs & DMA_ATTR_ALLOC_SINGLE_PAGES)
		alloc_sizes = min_size;

	count = PAGE_ALIGN(size) >> PAGE_SHIFT;
	pages = __iommu_dma_alloc_pages(dev, count, alloc_sizes >> PAGE_SHIFT,
					gfp);
	if (!pages)
		return NULL;

	size = iova_align(iovad, size);
	iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
	if (!iova)
		goto out_free_pages;

	if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL))
		goto out_free_iova;

	if (!(ioprot & IOMMU_CACHE)) {
		struct scatterlist *sg;
		int i;

		for_each_sg(sgt.sgl, sg, sgt.orig_nents, i)
			arch_dma_prep_coherent(sg_page(sg), sg->length);
	}

	if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot)
			< size)
		goto out_free_sg;

	vaddr = dma_common_pages_remap(pages, size, prot,
			__builtin_return_address(0));
	if (!vaddr)
		goto out_unmap;

	*dma_handle = iova;
	sg_free_table(&sgt);
	return vaddr;

out_unmap:
	__iommu_dma_unmap(dev, iova, size);
out_free_sg:
	sg_free_table(&sgt);
out_free_iova:
	iommu_dma_free_iova(cookie, iova, size);
out_free_pages:
	__iommu_dma_free_pages(pages, count);
	return NULL;
}
iommu_dma_alloc_remap() does roughly the following:
- Through iommu_dma_deferred_attach(), iommu_attach_device() performs the attachment of the system I/O device to the IOMMU when the attach was deferred.
- It computes the order mask (order_mask) and calls __iommu_dma_alloc_pages() to allocate the physical pages. domain->pgsize_bitmap is the set of page sizes the domain supports; every supported page size is a power of two, and the value is the bitwise OR of all supported sizes. In the min_size computation, the idea behind the expression alloc_sizes & -alloc_sizes is that a number's negation equals its bitwise complement plus one, so the expression is equivalent to alloc_sizes & ((~alloc_sizes) + 1); this is an efficient way to isolate the least significant set bit of the binary representation, i.e. the smallest supported page size. The minimum block allocated here is bounded below by PAGE_SIZE. An example pair: alloc_sizes(0x40201000), min_size(0x1000).
- iommu_dma_alloc_iova() allocates an address range in the I/O virtual address space; the dma_limit passed in is dev->coherent_dma_mask.
- sg_alloc_table_from_pages() allocates and initializes an sg table from the array of memory pages.
- iommu_map_sg_atomic() performs the IOMMU sg mapping.
iommu_attach_device() is defined in drivers/iommu/iommu.c as follows:
static int iommu_group_device_count(struct iommu_group *group)
{
	struct group_device *entry;
	int ret = 0;

	list_for_each_entry(entry, &group->devices, list)
		ret++;

	return ret;
}

. . . . . .

static int __iommu_attach_device(struct iommu_domain *domain,
				 struct device *dev)
{
	int ret;

	if (unlikely(domain->ops->attach_dev == NULL))
		return -ENODEV;

	ret = domain->ops->attach_dev(domain, dev);
	if (!ret)
		trace_attach_device_to_domain(dev);
	return ret;
}

int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
{
	struct iommu_group *group;
	int ret;

	group = iommu_group_get(dev);
	if (!group)
		return -ENODEV;

	/*
	 * Lock the group to make sure the device-count doesn't
	 * change while we are attaching
	 */
	mutex_lock(&group->mutex);
	ret = -EINVAL;
	if (iommu_group_device_count(group) != 1)
		goto out_unlock;

	ret = __iommu_attach_group(domain, group);

out_unlock:
	mutex_unlock(&group->mutex);
	iommu_group_put(group);

	return ret;
}
EXPORT_SYMBOL_GPL(iommu_attach_device);

. . . . . .

/*
 * IOMMU groups are really the natural working unit of the IOMMU, but
 * the IOMMU API works on domains and devices.  Bridge that gap by
 * iterating over the devices in a group.  Ideally we'd have a single
 * device which represents the requestor ID of the group, but we also
 * allow IOMMU drivers to create policy defined minimum sets, where
 * the physical hardware may be able to distiguish members, but we
 * wish to group them at a higher level (ex. untrusted multi-function
 * PCI devices).  Thus we attach each device.
 */
static int iommu_group_do_attach_device(struct device *dev, void *data)
{
	struct iommu_domain *domain = data;

	return __iommu_attach_device(domain, dev);
}

static int __iommu_attach_group(struct iommu_domain *domain,
				struct iommu_group *group)
{
	int ret;

	if (group->default_domain && group->domain != group->default_domain)
		return -EBUSY;

	ret = __iommu_group_for_each_dev(group, domain,
					 iommu_group_do_attach_device);
	if (ret == 0)
		group->domain = domain;

	return ret;
}
iommu_attach_device() obtains the IOMMU group from the system I/O device and counts the system I/O devices in the group; if the count is not 1 it returns, otherwise it attaches the system I/O device in the group to the IOMMU. The attachment is ultimately performed through the attach_dev() operation of the IOMMU callbacks.
__iommu_dma_alloc_pages() allocates the physical memory pages; it is defined in drivers/iommu/dma-iommu.c as follows:
static struct page **__iommu_dma_alloc_pages(struct device *dev,
		unsigned int count, unsigned long order_mask, gfp_t gfp)
{
	struct page **pages;
	unsigned int i = 0, nid = dev_to_node(dev);

	order_mask &= (2U << MAX_ORDER) - 1;
	if (!order_mask)
		return NULL;

	pages = kvzalloc(count * sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	/* IOMMU can map any pages, so himem can also be used here */
	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;

	/* It makes no sense to muck about with huge pages */
	gfp &= ~__GFP_COMP;

	while (count) {
		struct page *page = NULL;
		unsigned int order_size;

		/*
		 * Higher-order allocations are a convenience rather
		 * than a necessity, hence using __GFP_NORETRY until
		 * falling back to minimum-order allocations.
		 */
		for (order_mask &= (2U << __fls(count)) - 1;
		     order_mask; order_mask &= ~order_size) {
			unsigned int order = __fls(order_mask);
			gfp_t alloc_flags = gfp;

			order_size = 1U << order;
			if (order_mask > order_size)
				alloc_flags |= __GFP_NORETRY;
			page = alloc_pages_node(nid, alloc_flags, order);
			if (!page)
				continue;
			if (order)
				split_page(page, order);
			break;
		}
		if (!page) {
			__iommu_dma_free_pages(pages, i);
			return NULL;
		}
		count -= order_size;
		while (order_size--)
			pages[i++] = page++;
	}
	return pages;
}
In __iommu_dma_alloc_pages(), order_mask is the set of supported contiguous page counts. __fls() yields the bit index of the most significant set bit of a number. The expression (2U << __fls(count)) produces the bit one position above count's highest set bit; (2U << __fls(count)) - 1 produces a value with count's highest bit and all lower bits set; order_mask &= (2U << __fls(count)) - 1 therefore keeps only the supported contiguous page counts that fit within count. __fls(order_mask) gives the largest supported number of contiguous pages for a single allocation, and order_mask &= ~order_size clears that highest bit so the next attempt falls back to a smaller order.
__iommu_dma_alloc_pages() allocates the physical pages with two nested loops. The outer loop controls the total number of pages to allocate and handles allocation failure. The inner loop tries to allocate the largest supported number of contiguous physical pages; if that fails, it tries the next smaller supported count, and so on until an allocation succeeds. When a higher-order allocation succeeds, the large block is split into PAGE_SIZE pages. So although __iommu_dma_alloc_pages() returns pages of equal size, what it actually returns may be runs of pages that could also serve as larger contiguous blocks. The detailed physical page allocation descends into the memory-management subsystem and is omitted here.
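A small user-space illustration of the order arithmetic described above (count and order_mask are example values; fls_idx() mimics the kernel's __fls()):

#include <stdio.h>

static unsigned int fls_idx(unsigned long x)
{
	return 8 * sizeof(x) - 1 - __builtin_clzl(x); /* highest set bit */
}

int main(void)
{
	unsigned long count = 13;        /* pages still to allocate */
	unsigned long order_mask = 0x7;  /* supports orders 0, 1 and 2 */
	unsigned long order_size;

	order_mask &= (2UL << fls_idx(count)) - 1; /* orders that fit count */
	for (; order_mask; order_mask &= ~order_size) {
		unsigned int order = fls_idx(order_mask);

		order_size = 1UL << order;
		printf("try order %u (%lu pages)\n", order, order_size);
	}
	/* prints: try order 2 (4 pages), then order 1, then order 0 */
	return 0;
}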
iommu_dma_alloc_iova() allocates an address range in the I/O virtual address space; it is defined in drivers/iommu/dma-iommu.c as follows:
static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
		size_t size, u64 dma_limit, struct device *dev)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	unsigned long shift, iova_len, iova = 0;

	if (cookie->type == IOMMU_DMA_MSI_COOKIE) {
		cookie->msi_iova += size;
		return cookie->msi_iova - size;
	}

	shift = iova_shift(iovad);
	iova_len = size >> shift;
	/*
	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
	 * will come back to bite us badly, so we have to waste a bit of space
	 * rounding up anything cacheable to make sure that can't happen. The
	 * order of the unadjusted size will still match upon freeing.
	 */
	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
		iova_len = roundup_pow_of_two(iova_len);

	dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);

	if (domain->geometry.force_aperture)
		dma_limit = min(dma_limit, (u64)domain->geometry.aperture_end);

	/* Try to get PCI devices a SAC address */
	if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
		iova = alloc_iova_fast(iovad, iova_len,
				       DMA_BIT_MASK(32) >> shift, false);

	if (!iova)
		iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift,
				       true);

	return (dma_addr_t)iova << shift;
}
iommu_dma_alloc_iova() recomputes the effective upper limit address from the dma_limit passed in, the domain's constraints, and the device type, computes the number of I/O virtual address space blocks to allocate from the requested size, and calls alloc_iova_fast() to allocate the range in the I/O virtual address space. For PCI(e) devices, it first tries to obtain a SAC (single address cycle, i.e. 32-bit) address, and on failure allocates within the computed dma_limit.
alloc_iova_fast() is defined in drivers/iommu/iova.c as follows:
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
		unsigned long size, unsigned long limit_pfn,
			struct iova *new, bool size_aligned)
{
	struct rb_node *curr, *prev;
	struct iova *curr_iova;
	unsigned long flags;
	unsigned long new_pfn;
	unsigned long align_mask = ~0UL;

	if (size_aligned)
		align_mask <<= fls_long(size - 1);

	/* Walk the tree backwards */
	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
	if (limit_pfn <= iovad->dma_32bit_pfn &&
			size >= iovad->max32_alloc_size)
		goto iova32_full;

	curr = __get_cached_rbnode(iovad, limit_pfn);
	curr_iova = rb_entry(curr, struct iova, node);
	do {
		limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
		new_pfn = (limit_pfn - size) & align_mask;
		prev = curr;
		curr = rb_prev(curr);
		curr_iova = rb_entry(curr, struct iova, node);
	} while (curr && new_pfn <= curr_iova->pfn_hi);

	if (limit_pfn < size || new_pfn < iovad->start_pfn) {
		iovad->max32_alloc_size = size;
		goto iova32_full;
	}

	/* pfn_lo will point to size aligned address if size_aligned is set */
	new->pfn_lo = new_pfn;
	new->pfn_hi = new->pfn_lo + size - 1;

	/* If we have 'prev', it's a valid place to start the insertion. */
	iova_insert_rbtree(&iovad->rbroot, new, prev);
	__cached_rbnode_insert_update(iovad, new);

	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
	return 0;

iova32_full:
	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
	return -ENOMEM;
}

static struct kmem_cache *iova_cache;
static unsigned int iova_cache_users;
static DEFINE_MUTEX(iova_cache_mutex);

struct iova *alloc_iova_mem(void)
{
	return kmem_cache_zalloc(iova_cache, GFP_ATOMIC | __GFP_NOWARN);
}
EXPORT_SYMBOL(alloc_iova_mem);

. . . . . .

/**
 * alloc_iova - allocates an iova
 * @iovad: - iova domain in question
 * @size: - size of page frames to allocate
 * @limit_pfn: - max limit address
 * @size_aligned: - set if size_aligned address range is required
 * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
 * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
 * flag is set then the allocated address iova->pfn_lo will be naturally
 * aligned on roundup_power_of_two(size).
 */
struct iova *
alloc_iova(struct iova_domain *iovad, unsigned long size,
	unsigned long limit_pfn,
	bool size_aligned)
{
	struct iova *new_iova;
	int ret;

	new_iova = alloc_iova_mem();
	if (!new_iova)
		return NULL;

	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
			new_iova, size_aligned);

	if (ret) {
		free_iova_mem(new_iova);
		return NULL;
	}

	return new_iova;
}
EXPORT_SYMBOL_GPL(alloc_iova);

. . . . . .

/**
 * alloc_iova_fast - allocates an iova from rcache
 * @iovad: - iova domain in question
 * @size: - size of page frames to allocate
 * @limit_pfn: - max limit address
 * @flush_rcache: - set to flush rcache on regular allocation failure
 * This function tries to satisfy an iova allocation from the rcache,
 * and falls back to regular allocation on failure. If regular allocation
 * fails too and the flush_rcache flag is set then the rcache will be flushed.
 */
unsigned long
alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
		unsigned long limit_pfn, bool flush_rcache)
{
	unsigned long iova_pfn;
	struct iova *new_iova;

	iova_pfn = iova_rcache_get(iovad, size, limit_pfn + 1);
	if (iova_pfn)
		return iova_pfn;

retry:
	new_iova = alloc_iova(iovad, size, limit_pfn, true);
	if (!new_iova) {
		unsigned int cpu;

		if (!flush_rcache)
			return 0;

		/* Try replenishing IOVAs by flushing rcache. */
		flush_rcache = false;
		for_each_online_cpu(cpu)
			free_cpu_cached_iovas(cpu, iovad);
		goto retry;
	}

	return new_iova->pfn_lo;
}
EXPORT_SYMBOL_GPL(alloc_iova_fast);

. . . . . .

/*
 * Caller wants to allocate a new IOVA range from 'rcache'.  If we can
 * satisfy the request, return a matching non-NULL range and remove
 * it from the 'rcache'.
 */
static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
				       unsigned long limit_pfn)
{
	struct iova_cpu_rcache *cpu_rcache;
	unsigned long iova_pfn = 0;
	bool has_pfn = false;
	unsigned long flags;

	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
	spin_lock_irqsave(&cpu_rcache->lock, flags);

	if (!iova_magazine_empty(cpu_rcache->loaded)) {
		has_pfn = true;
	} else if (!iova_magazine_empty(cpu_rcache->prev)) {
		swap(cpu_rcache->prev, cpu_rcache->loaded);
		has_pfn = true;
	} else {
		spin_lock(&rcache->lock);
		if (rcache->depot_size > 0) {
			iova_magazine_free(cpu_rcache->loaded);
			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
			has_pfn = true;
		}
		spin_unlock(&rcache->lock);
	}

	if (has_pfn)
		iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);

	spin_unlock_irqrestore(&cpu_rcache->lock, flags);

	return iova_pfn;
}

/*
 * Try to satisfy IOVA allocation range from rcache.  Fail if requested
 * size is too big or the DMA limit we are given isn't satisfied by the
 * top element in the magazine.
 */
static unsigned long iova_rcache_get(struct iova_domain *iovad,
				     unsigned long size,
				     unsigned long limit_pfn)
{
	unsigned int log_size = order_base_2(size);

	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
		return 0;

	return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn - size);
}
alloc_iova_fast() first tries to find a cached I/O virtual address range satisfying the constraints and returns it on a hit; otherwise it calls alloc_iova() to allocate an IOVA range, and if that fails and a cache flush is allowed, it frees all CPU-cached IOVA ranges and retries. In alloc_iova(), a struct iova object is allocated first, and then an IOVA range is allocated subject to the address limit.
In iommu_dma_alloc_remap(), one source of dma_limit is dev->coherent_dma_mask. In a system I/O device driver, before allocating memory for DMA, functions such as dma_coerce_mask_and_coherent()/dma_set_mask_and_coherent()/dma_set_coherent_mask()/dma_set_mask() need to be called to set the DMA mask and coherent DMA mask for the system I/O device, e.g.:
if (WARN_ON(!dev->dma_mask)) {
/* Platform did not initialize dma_mask */
ret = dma_coerce_mask_and_coherent(i2s->dev, DMA_BIT_MASK(40));
} else {
ret = dma_set_mask_and_coherent(i2s->dev, DMA_BIT_MASK(40));
}
The DMA_BIT_MASK(n) macro and the functions above are defined or declared in include/linux/dma-mapping.h as follows:
#define DMA_MAPPING_ERROR		(~(dma_addr_t)0)

#define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))

. . . . . .

int dma_set_mask(struct device *dev, u64 mask);
int dma_set_coherent_mask(struct device *dev, u64 mask);

. . . . . .

/*
 * Set both the DMA mask and the coherent DMA mask to the same thing.
 * Note that we don't check the return value from dma_set_coherent_mask()
 * as the DMA API guarantees that the coherent DMA mask can be set to
 * the same or smaller than the streaming DMA mask.
 */
static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask)
{
	int rc = dma_set_mask(dev, mask);
	if (rc == 0)
		dma_set_coherent_mask(dev, mask);
	return rc;
}

/*
 * Similar to the above, except it deals with the case where the device
 * does not have dev->dma_mask appropriately setup.
 */
static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
{
	dev->dma_mask = &dev->coherent_dma_mask;
	return dma_set_mask_and_coherent(dev, mask);
}
The dma_coerce_mask_and_coherent()/dma_set_mask_and_coherent() interfaces set the system I/O device's DMA mask and coherent DMA mask to the same value. Taking the code above, which passes DMA_BIT_MASK(40) as the mask, the mask will be 0xFFFFFFFFFF (40 bits of ones). They call dma_set_mask()/dma_set_coherent_mask() to set the DMA mask and the coherent DMA mask respectively. dma_set_mask()/dma_set_coherent_mask() are defined in kernel/dma/mapping.c as follows:
int dma_supported(struct device *dev, u64 mask)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	/*
	 * ->dma_supported sets the bypass flag, so we must always call
	 * into the method here unless the device is truly direct mapped.
	 */
	if (!ops)
		return dma_direct_supported(dev, mask);
	if (!ops->dma_supported)
		return 1;
	return ops->dma_supported(dev, mask);
}
EXPORT_SYMBOL(dma_supported);

#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
void arch_dma_set_mask(struct device *dev, u64 mask);
#else
#define arch_dma_set_mask(dev, mask)	do { } while (0)
#endif

int dma_set_mask(struct device *dev, u64 mask)
{
	/*
	 * Truncate the mask to the actually supported dma_addr_t width to
	 * avoid generating unsupportable addresses.
	 */
	mask = (dma_addr_t)mask;

	if (!dev->dma_mask || !dma_supported(dev, mask))
		return -EIO;

	arch_dma_set_mask(dev, mask);
	*dev->dma_mask = mask;
	return 0;
}
EXPORT_SYMBOL(dma_set_mask);

#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
	/*
	 * Truncate the mask to the actually supported dma_addr_t width to
	 * avoid generating unsupportable addresses.
	 */
	mask = (dma_addr_t)mask;

	if (!dma_supported(dev, mask))
		return -EIO;

	dev->coherent_dma_mask = mask;
	return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
#endif
The DMA mask and coherent DMA mask that a system I/O device driver sets for its device have a considerable influence on DMA memory allocation. For a system I/O device attached to an SMMU, it is best to keep this value consistent with the SMMU's input address size (IAS); e.g. with a 40-bit IAS, set the DMA mask and coherent DMA mask to DMA_BIT_MASK(40).
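A quick user-space check of the macro's arithmetic, using the DMA_BIT_MASK() definition quoted above:

#include <stdio.h>

#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

int main(void)
{
	printf("%llx\n", DMA_BIT_MASK(32)); /* ffffffff */
	printf("%llx\n", DMA_BIT_MASK(40)); /* ffffffffff */
	printf("%llx\n", DMA_BIT_MASK(64)); /* ffffffffffffffff */
	return 0;
}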
dma_limit is the upper bound for allocating an I/O virtual address range. The IOVA part of the IOMMU subsystem searches downward from this bound for an available range, manages the already-allocated ranges with a red-black tree, and additionally maintains caches of ranges to speed up allocation.
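A hedged kernel-style sketch of driving the IOVA allocator directly with the functions shown above (the granule, start_pfn, and limit are illustrative):

struct iova_domain iovad; /* assume zero-initialized storage */
struct iova *range;

init_iova_domain(&iovad, SZ_4K, 1);	/* 4 KB granule, start_pfn 1 */
range = alloc_iova(&iovad, 16,		/* 16 granule-sized frames */
		   (SZ_4G >> 12) - 1,	/* limit_pfn below 4 GB */
		   true);		/* size-aligned */
if (range) {
	/* covers IOVA range->pfn_lo << 12 .. range->pfn_hi << 12 */
	__free_iova(&iovad, range);
}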
In iommu_dma_alloc_remap(), sg_alloc_table_from_pages() allocates and initializes the sg table from the array of memory pages; it is defined in lib/scatterlist.c as follows:
static struct scatterlist *get_next_sg(struct sg_table *table,
				       struct scatterlist *cur,
				       unsigned long needed_sges,
				       gfp_t gfp_mask)
{
	struct scatterlist *new_sg, *next_sg;
	unsigned int alloc_size;

	if (cur) {
		next_sg = sg_next(cur);
		/* Check if last entry should be keeped for chainning */
		if (!sg_is_last(next_sg) || needed_sges == 1)
			return next_sg;
	}

	alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC);
	new_sg = sg_kmalloc(alloc_size, gfp_mask);
	if (!new_sg)
		return ERR_PTR(-ENOMEM);
	sg_init_table(new_sg, alloc_size);
	if (cur) {
		__sg_chain(next_sg, new_sg);
		table->orig_nents += alloc_size - 1;
	} else {
		table->sgl = new_sg;
		table->orig_nents = alloc_size;
		table->nents = 0;
	}
	return new_sg;
}

/**
 * __sg_alloc_table_from_pages - Allocate and initialize an sg table from
 *			         an array of pages
 * @sgt:	 The sg table header to use
 * @pages:	 Pointer to an array of page pointers
 * @n_pages:	 Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @max_segment: Maximum size of a scatterlist element in bytes
 * @prv:	 Last populated sge in sgt
 * @left_pages:  Left pages caller have to set after this call
 * @gfp_mask:	 GFP allocation mask
 *
 * Description:
 *    If @prv is NULL, allocate and initialize an sg table from a list of pages,
 *    else reuse the scatterlist passed in at @prv.
 *    Contiguous ranges of the pages are squashed into a single scatterlist
 *    entry up to the maximum size specified in @max_segment. A user may
 *    provide an offset at a start and a size of valid data in a buffer
 *    specified by the page array.
 *
 * Returns:
 *   Last SGE in sgt on success, PTR_ERR on otherwise.
 *   The allocation in @sgt must be released by sg_free_table.
 *
 * Notes:
 *   If this function returns non-0 (eg failure), the caller must call
 *   sg_free_table() to cleanup any leftover allocations.
 */
struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
		struct page **pages, unsigned int n_pages, unsigned int offset,
		unsigned long size, unsigned int max_segment,
		struct scatterlist *prv, unsigned int left_pages,
		gfp_t gfp_mask)
{
	unsigned int chunks, cur_page, seg_len, i, prv_len = 0;
	unsigned int added_nents = 0;
	struct scatterlist *s = prv;

	/*
	 * The algorithm below requires max_segment to be aligned to PAGE_SIZE
	 * otherwise it can overshoot.
	 */
	max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE);
	if (WARN_ON(max_segment < PAGE_SIZE))
		return ERR_PTR(-EINVAL);

	if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && prv)
		return ERR_PTR(-EOPNOTSUPP);

	if (prv) {
		unsigned long paddr = (page_to_pfn(sg_page(prv)) * PAGE_SIZE +
				       prv->offset + prv->length) /
				      PAGE_SIZE;

		if (WARN_ON(offset))
			return ERR_PTR(-EINVAL);

		/* Merge contiguous pages into the last SG */
		prv_len = prv->length;
		while (n_pages && page_to_pfn(pages[0]) == paddr) {
			if (prv->length + PAGE_SIZE > max_segment)
				break;
			prv->length += PAGE_SIZE;
			paddr++;
			pages++;
			n_pages--;
		}
		if (!n_pages)
			goto out;
	}

	/* compute number of contiguous chunks */
	chunks = 1;
	seg_len = 0;
	for (i = 1; i < n_pages; i++) {
		seg_len += PAGE_SIZE;
		if (seg_len >= max_segment ||
		    page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) {
			chunks++;
			seg_len = 0;
		}
	}

	/* merging chunks and putting them into the scatterlist */
	cur_page = 0;
	for (i = 0; i < chunks; i++) {
		unsigned int j, chunk_size;

		/* look for the end of the current chunk */
		seg_len = 0;
		for (j = cur_page + 1; j < n_pages; j++) {
			seg_len += PAGE_SIZE;
			if (seg_len >= max_segment ||
			    page_to_pfn(pages[j]) !=
			    page_to_pfn(pages[j - 1]) + 1)
				break;
		}

		/* Pass how many chunks might be left */
		s = get_next_sg(sgt, s, chunks - i + left_pages, gfp_mask);
		if (IS_ERR(s)) {
			/*
			 * Adjust entry length to be as before function was
			 * called.
			 */
			if (prv)
				prv->length = prv_len;
			return s;
		}
		chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
		sg_set_page(s, pages[cur_page],
			    min_t(unsigned long, size, chunk_size), offset);
		added_nents++;
		size -= chunk_size;
		offset = 0;
		cur_page = j;
	}
	sgt->nents += added_nents;
out:
	if (!left_pages)
		sg_mark_end(s);

	return s;
}
EXPORT_SYMBOL(__sg_alloc_table_from_pages);

/**
 * sg_alloc_table_from_pages - Allocate and initialize an sg table from
 *			       an array of pages
 * @sgt:	 The sg table header to use
 * @pages:	 Pointer to an array of page pointers
 * @n_pages:	 Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @gfp_mask:	 GFP allocation mask
 *
 * Description:
 *   Allocate and initialize an sg table from a list of pages. Contiguous
 *   ranges of the pages are squashed into a single scatterlist node. A user
 *   may provide an offset at a start and a size of valid data in a buffer
 *   specified by the page array. The returned sg table is released by
 *   sg_free_table.
 *
 * Returns:
 *   0 on success, negative error on failure
 */
int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
			      unsigned int n_pages, unsigned int offset,
			      unsigned long size, gfp_t gfp_mask)
{
	return PTR_ERR_OR_ZERO(__sg_alloc_table_from_pages(sgt, pages, n_pages,
			offset, size, UINT_MAX, NULL, 0, gfp_mask));
}
EXPORT_SYMBOL(sg_alloc_table_from_pages);
sg_alloc_table_from_pages()/__sg_alloc_table_from_pages() proceed roughly as follows: they compute the number of physically contiguous chunks in the page array, then walk those chunks, squashing each run of contiguous pages into a single scatterlist entry of at most max_segment bytes, allocating additional scatterlist nodes through get_next_sg() as needed, and finally mark the last entry with sg_mark_end().
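A hedged kernel-style sketch of the call itself (the page array and count are assumed to exist):

struct sg_table sgt;
int ret;

/* pages[] holds n_pages struct page pointers filled in earlier */
ret = sg_alloc_table_from_pages(&sgt, pages, n_pages,
				0,		      /* offset in first page */
				n_pages * PAGE_SIZE,  /* valid bytes */
				GFP_KERNEL);
if (!ret) {
	/* ... use sgt.sgl / sgt.nents, e.g. for iommu_map_sg_atomic() ... */
	sg_free_table(&sgt);
}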
In iommu_dma_alloc_remap(), iommu_map_sg_atomic() establishes the mappings between the physical memory pages and the IOVA addresses; it is defined in drivers/iommu/iommu.c as follows:
size_t iommu_pgsize(struct iommu_domain *domain,
		    unsigned long addr_merge, size_t size)
{
	unsigned int pgsize_idx;
	size_t pgsize;

	/* Max page size that still fits into 'size' */
	pgsize_idx = __fls(size);

	/* need to consider alignment requirements ? */
	if (likely(addr_merge)) {
		/* Max page size allowed by address */
		unsigned int align_pgsize_idx = __ffs(addr_merge);
		pgsize_idx = min(pgsize_idx, align_pgsize_idx);
	}

	/* build a mask of acceptable page sizes */
	pgsize = (1UL << (pgsize_idx + 1)) - 1;

	/* throw away page sizes not supported by the hardware */
	pgsize &= domain->pgsize_bitmap;

	/* make sure we're still sane */
	BUG_ON(!pgsize);

	/* pick the biggest page */
	pgsize_idx = __fls(pgsize);
	pgsize = 1UL << pgsize_idx;

	return pgsize;
}
EXPORT_SYMBOL_GPL(iommu_pgsize);

static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
		       phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
	const struct iommu_ops *ops = domain->ops;
	unsigned long orig_iova = iova;
	unsigned int min_pagesz;
	size_t orig_size = size;
	phys_addr_t orig_paddr = paddr;
	int ret = 0;

	if (unlikely(ops->map == NULL ||
		     domain->pgsize_bitmap == 0UL))
		return -ENODEV;

	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
		return -EINVAL;

	/* find out the minimum page size supported */
	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);

	/*
	 * both the virtual address and the physical one, as well as
	 * the size of the mapping, must be aligned (at least) to the
	 * size of the smallest page supported by the hardware
	 */
	if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
		pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n",
		       iova, &paddr, size, min_pagesz);
		return -EINVAL;
	}

	pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size);

	while (size) {
		size_t pgsize = iommu_pgsize(domain, iova | paddr, size);

		pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n",
			 iova, &paddr, pgsize);
		ret = ops->map(domain, iova, paddr, pgsize, prot, gfp);

		if (ret)
			break;

		iova += pgsize;
		paddr += pgsize;
		size -= pgsize;
	}

	/* unroll mapping in case something went wrong */
	if (ret)
		iommu_unmap(domain, orig_iova, orig_size - size);
	else
		trace_map(orig_iova, orig_paddr, orig_size);

	return ret;
}

. . . . . .

static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
			     struct scatterlist *sg, unsigned int nents, int prot,
			     gfp_t gfp)
{
	const struct iommu_ops *ops = domain->ops;
	size_t len = 0, mapped = 0;
	phys_addr_t start;
	unsigned int i = 0;
	int ret;

	while (i <= nents) {
		phys_addr_t s_phys = sg_phys(sg);

		if (len && s_phys != start + len) {
			ret = __iommu_map(domain, iova + mapped, start,
					len, prot, gfp);

			if (ret)
				goto out_err;

			mapped += len;
			len = 0;
		}

		if (len) {
			len += sg->length;
		} else {
			len = sg->length;
			start = s_phys;
		}

		if (++i < nents)
			sg = sg_next(sg);
	}

	if (ops->iotlb_sync_map)
		ops->iotlb_sync_map(domain, iova, mapped);
	return mapped;

out_err:
	/* undo mappings already done */
	iommu_unmap(domain, iova, mapped);

	return 0;
}

. . . . . .

size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
			   struct scatterlist *sg, unsigned int nents, int prot)
{
	return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
}
EXPORT_SYMBOL_GPL(iommu_map_sg_atomic);
iommu_map_sg_atomic()/__iommu_map_sg() walk the scatterlist entries one by one, and for each scatterlist entry they:
- Take the physical address of the scatterlist memory block.
- Map a physically contiguous run with __iommu_map(): when entry n+1 is visited, the mapping for the run ending at entry n is performed. The IOVA advances contiguously and monotonically, while the physical address may jump wherever the scatterlist's physical memory addresses jump.
- Move on to the next scatterlist entry.
At the end, __iommu_map_sg() also synchronizes the mapping through the iotlb_sync_map callback of the IOMMU device driver's IOMMU callbacks.
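Before looking at __iommu_map() itself, a small user-space illustration of how iommu_pgsize() (shown above) would pick each mapping step's page size; the bitmap and addresses are example values:

#include <stdio.h>

static unsigned int fls_idx(unsigned long x)
{
	return 8 * sizeof(x) - 1 - __builtin_clzl(x);
}

/* Re-implementation of the iommu_pgsize() logic quoted above. */
static unsigned long pgsize(unsigned long pgsize_bitmap,
			    unsigned long addr_merge, unsigned long size)
{
	unsigned int idx = fls_idx(size);	/* max size fitting 'size' */

	if (addr_merge) {			/* max size allowed by alignment */
		unsigned int align_idx = __builtin_ctzl(addr_merge);

		if (align_idx < idx)
			idx = align_idx;
	}
	return 1UL << fls_idx(((1UL << (idx + 1)) - 1) & pgsize_bitmap);
}

int main(void)
{
	/* 4K|2M|1G supported, iova|paddr 2 MB aligned, 5 MB left to map */
	printf("0x%lx\n", pgsize(0x40201000UL, 0x200000UL, 5UL << 20));
	/* prints 0x200000: a 2 MB mapping step is chosen */
	return 0;
}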
In __iommu_map(), the IOVA, the physical address, and the size must all be aligned to the smallest supported page size; the mapping is then performed chunk by chunk through the map() callback of the IOMMU device driver's IOMMU callbacks, with iommu_pgsize() choosing, at each step, the largest supported page size that fits the remaining size and alignment. The map() callback of the SMMUv3 device driver is arm_smmu_map(), defined in drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c as follows:
static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
if (!ops)
return -ENODEV;
return ops->map(ops, iova, paddr, size, prot, gfp);
}
arm_smmu_map() wraps the map() callback of the domain's I/O page-table operations. The domain's I/O page-table operations are created in arm_smmu_attach_dev().
The map() callback of the domain's I/O page-table operations is the arm_lpae_map() function, defined in drivers/iommu/io-pgtable-arm.c as follows:
static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
			phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp)
{
	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
	struct io_pgtable_cfg *cfg = &data->iop.cfg;
	arm_lpae_iopte *ptep = data->pgd;
	int ret, lvl = data->start_level;
	arm_lpae_iopte prot;
	long iaext = (s64)iova >> cfg->ias;

	/* If no access, then nothing to do */
	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
		return 0;

	if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size))
		return -EINVAL;

	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
		iaext = ~iaext;
	if (WARN_ON(iaext || paddr >> cfg->oas))
		return -ERANGE;

	prot = arm_lpae_prot_to_pte(data, iommu_prot);
	ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep, gfp);
	/*
	 * Synchronise all PTE updates for the new mapping before there's
	 * a chance for anything to kick off a table walk for the new iova.
	 */
	wmb();

	return ret;
}
In the SMMUv3 device driver, the address translation table of a system I/O device is allocated well in advance. During arm_smmu_attach_dev(), when the I/O page-table operations are created for the domain in alloc_io_pgtable_ops() -> arm_64_lpae_alloc_pgtable_s1(), the address translation table has already been created for the system I/O device, and configuration such as the translation table base address (ttbr) is saved in io_pgtable_cfg, as in the definition of arm_64_lpae_alloc_pgtable_s1() (in drivers/iommu/io-pgtable-arm.c):
static struct io_pgtable *
arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
{
	u64 reg;
	struct arm_lpae_io_pgtable *data;
	typeof(&cfg->arm_lpae_s1_cfg.tcr) tcr = &cfg->arm_lpae_s1_cfg.tcr;
	bool tg1;

	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
			    IO_PGTABLE_QUIRK_NON_STRICT |
			    IO_PGTABLE_QUIRK_ARM_TTBR1 |
			    IO_PGTABLE_QUIRK_ARM_HD |
			    IO_PGTABLE_QUIRK_ARM_BBML1 |
			    IO_PGTABLE_QUIRK_ARM_BBML2))
		return NULL;

	data = arm_lpae_alloc_pgtable(cfg);
	if (!data)
		return NULL;

	/* TCR */
	if (cfg->coherent_walk) {
		tcr->sh = ARM_LPAE_TCR_SH_IS;
		tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
		tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
	} else {
		tcr->sh = ARM_LPAE_TCR_SH_OS;
		tcr->irgn = ARM_LPAE_TCR_RGN_NC;
		tcr->orgn = ARM_LPAE_TCR_RGN_NC;
	}

	tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
	switch (ARM_LPAE_GRANULE(data)) {
	case SZ_4K:
		tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_4K : ARM_LPAE_TCR_TG0_4K;
		break;
	case SZ_16K:
		tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_16K : ARM_LPAE_TCR_TG0_16K;
		break;
	case SZ_64K:
		tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_64K : ARM_LPAE_TCR_TG0_64K;
		break;
	}

	switch (cfg->oas) {
	case 32:
		tcr->ips = ARM_LPAE_TCR_PS_32_BIT;
		break;
	case 36:
		tcr->ips = ARM_LPAE_TCR_PS_36_BIT;
		break;
	case 40:
		tcr->ips = ARM_LPAE_TCR_PS_40_BIT;
		break;
	case 42:
		tcr->ips = ARM_LPAE_TCR_PS_42_BIT;
		break;
	case 44:
		tcr->ips = ARM_LPAE_TCR_PS_44_BIT;
		break;
	case 48:
		tcr->ips = ARM_LPAE_TCR_PS_48_BIT;
		break;
	case 52:
		tcr->ips = ARM_LPAE_TCR_PS_52_BIT;
		break;
	default:
		goto out_free_data;
	}

	tcr->tsz = 64ULL - cfg->ias;

	/* MAIRs */
	reg = (ARM_LPAE_MAIR_ATTR_NC
	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
	      (ARM_LPAE_MAIR_ATTR_WBRWA
	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
	      (ARM_LPAE_MAIR_ATTR_DEVICE
	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
	      (ARM_LPAE_MAIR_ATTR_INC_OWBRWA
	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE));

	cfg->arm_lpae_s1_cfg.mair = reg;

	/* Looking good; allocate a pgd */
	data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
					   GFP_KERNEL, cfg);
	if (!data->pgd)
		goto out_free_data;

	/* Ensure the empty pgd is visible before any actual TTBR write */
	wmb();

	/* TTBR */
	cfg->arm_lpae_s1_cfg.ttbr = virt_to_phys(data->pgd);
	return &data->iop;

out_free_data:
	kfree(data);
	return NULL;
}
Later, in the arm_smmu_write_ctx_desc() function, when the context descriptor is written for the system I/O device, the translation table base address and its configuration fields are written along with it, as seen in the definition of the arm_smmu_write_ctx_desc() function (in the drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c file):
int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
                            struct arm_smmu_ctx_desc *cd)
{
        /*
         * This function handles the following cases:
         *
         * (1) Install primary CD, for normal DMA traffic (SSID = 0).
         * (2) Install a secondary CD, for SID+SSID traffic.
         * (3) Update ASID of a CD. Atomically write the first 64 bits of the
         *     CD, then invalidate the old entry and mappings.
         * (4) Quiesce the context without clearing the valid bit. Disable
         *     translation, and ignore any translation fault.
         * (5) Remove a secondary CD.
         */
        u64 val;
        bool cd_live;
        __le64 *cdptr;
        struct arm_smmu_device *smmu = smmu_domain->smmu;

        if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax)))
                return -E2BIG;

        cdptr = arm_smmu_get_cd_ptr(smmu_domain, ssid);
        if (!cdptr)
                return -ENOMEM;

        val = le64_to_cpu(cdptr[0]);
        cd_live = !!(val & CTXDESC_CD_0_V);

        if (!cd) { /* (5) */
                val = 0;
        } else if (cd == &quiet_cd) { /* (4) */
                val |= CTXDESC_CD_0_TCR_EPD0;
        } else if (cd_live) { /* (3) */
                val &= ~CTXDESC_CD_0_ASID;
                val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
                /*
                 * Until CD+TLB invalidation, both ASIDs may be used for tagging
                 * this substream's traffic
                 */
        } else { /* (1) and (2) */
                u64 tcr = cd->tcr;

                cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK);
                cdptr[2] = 0;
                cdptr[3] = cpu_to_le64(cd->mair);

                if (!(smmu->features & ARM_SMMU_FEAT_HD))
                        tcr &= ~CTXDESC_CD_0_TCR_HD;
                if (!(smmu->features & ARM_SMMU_FEAT_HA))
                        tcr &= ~CTXDESC_CD_0_TCR_HA;

                /*
                 * STE is live, and the SMMU might read dwords of this CD in any
                 * order. Ensure that it observes valid values before reading
                 * V=1.
                 */
                arm_smmu_sync_cd(smmu_domain, ssid, true);

                val = tcr |
#ifdef __BIG_ENDIAN
                        CTXDESC_CD_0_ENDI |
#endif
                        CTXDESC_CD_0_R | CTXDESC_CD_0_A |
                        (cd->mm ? 0 : CTXDESC_CD_0_ASET) |
                        CTXDESC_CD_0_AA64 |
                        FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) |
                        CTXDESC_CD_0_V;

                if (smmu_domain->stall_enabled)
                        val |= CTXDESC_CD_0_S;
        }

        /*
         * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3
         * "Configuration structures and configuration invalidation completion"
         *
         *   The size of single-copy atomic reads made by the SMMU is
         *   IMPLEMENTATION DEFINED but must be at least 64 bits. Any single
         *   field within an aligned 64-bit span of a structure can be altered
         *   without first making the structure invalid.
         */
        WRITE_ONCE(cdptr[0], cpu_to_le64(val));
        arm_smmu_sync_cd(smmu_domain, ssid, true);
        return 0;
}
According to the ARM System Memory Management Unit Architecture Specification version 3, a context descriptor is 512 bits in total: 64 bytes, i.e. eight 64-bit words:
[Figure: SMMUv3 context descriptor layout]
Many of those bits are reserved or IMPLEMENTATION DEFINED and are not actually used by the standard SMMUv3 device driver. The arm_smmu_write_ctx_desc() function writes the necessary fields, including the translation table base address.
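Concretely, the code above populates only a few of those eight 64-bit words. A summary of which word carries what, derived from the arm_smmu_write_ctx_desc() listing:

/*
 * Context descriptor words written by arm_smmu_write_ctx_desc():
 *
 *   cdptr[0]: TCR fields | ASID | R | A | ASET | AA64 | V (the valid bit,
 *             written last, after arm_smmu_sync_cd())
 *   cdptr[1]: TTB0 - the translation table base address (cd->ttbr)
 *   cdptr[2]: 0
 *   cdptr[3]: MAIR - memory attribute indirection (cd->mair)
 *   cdptr[4..7]: left zero by the standard driver
 */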
The arm_lpae_map() function fills in the contents of the address translation table, mainly in two steps: the arm_lpae_prot_to_pte() function constructs a page table entry containing the main configuration bits, and the __arm_lpae_map() function uses the I/O virtual address to locate the entry, fills in the physical address to complete it, and writes the entry into the translation table, also creating intermediate page directories when a multi-level table is required. The arm_lpae_prot_to_pte() function is defined (in the drivers/iommu/io-pgtable-arm.c file) as follows:
static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
                                           int prot)
{
        struct io_pgtable_cfg *cfg = &data->iop.cfg;
        arm_lpae_iopte pte;

        if (data->iop.fmt == ARM_64_LPAE_S1 ||
            data->iop.fmt == ARM_32_LPAE_S1) {
                pte = ARM_LPAE_PTE_nG;
                if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
                        pte |= ARM_LPAE_PTE_AP_RDONLY;
                else if (data->iop.fmt == ARM_64_LPAE_S1 &&
                         cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD)
                        pte |= ARM_LPAE_PTE_DBM;
                if (!(prot & IOMMU_PRIV))
                        pte |= ARM_LPAE_PTE_AP_UNPRIV;
        } else {
                pte = ARM_LPAE_PTE_HAP_FAULT;
                if (prot & IOMMU_READ)
                        pte |= ARM_LPAE_PTE_HAP_READ;
                if (prot & IOMMU_WRITE)
                        pte |= ARM_LPAE_PTE_HAP_WRITE;
        }

        /*
         * Note that this logic is structured to accommodate Mali LPAE
         * having stage-1-like attributes but stage-2-like permissions.
         */
        if (data->iop.fmt == ARM_64_LPAE_S2 ||
            data->iop.fmt == ARM_32_LPAE_S2) {
                if (prot & IOMMU_MMIO)
                        pte |= ARM_LPAE_PTE_MEMATTR_DEV;
                else if (prot & IOMMU_CACHE)
                        pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
                else
                        pte |= ARM_LPAE_PTE_MEMATTR_NC;
        } else {
                if (prot & IOMMU_MMIO)
                        pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
                                << ARM_LPAE_PTE_ATTRINDX_SHIFT);
                else if (prot & IOMMU_CACHE)
                        pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
                                << ARM_LPAE_PTE_ATTRINDX_SHIFT);
        }

        /*
         * Also Mali has its own notions of shareability wherein its Inner
         * domain covers the cores within the GPU, and its Outer domain is
         * "outside the GPU" (i.e. either the Inner or System domain in CPU
         * terms, depending on coherency).
         */
        if (prot & IOMMU_CACHE && data->iop.fmt != ARM_MALI_LPAE)
                pte |= ARM_LPAE_PTE_SH_IS;
        else
                pte |= ARM_LPAE_PTE_SH_OS;

        if (prot & IOMMU_NOEXEC)
                pte |= ARM_LPAE_PTE_XN;

        if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
                pte |= ARM_LPAE_PTE_NS;

        if (data->iop.fmt != ARM_MALI_LPAE)
                pte |= ARM_LPAE_PTE_AF;

        return pte;
}
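As an example of what this function produces: for a stage-1 format with prot = IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE and no quirks, following the listing above, the resulting entry is:

/*
 * pte = ARM_LPAE_PTE_nG                  stage-1 entries are not global
 *     | ARM_LPAE_PTE_AP_UNPRIV           no IOMMU_PRIV -> unprivileged access
 *     | (ARM_LPAE_MAIR_ATTR_IDX_CACHE
 *        << ARM_LPAE_PTE_ATTRINDX_SHIFT) IOMMU_CACHE -> cacheable attr index
 *     | ARM_LPAE_PTE_SH_IS               cacheable -> inner shareable
 *     | ARM_LPAE_PTE_AF                  access flag set up front
 *
 * ARM_LPAE_PTE_AP_RDONLY is not set because IOMMU_WRITE is present, and
 * ARM_LPAE_PTE_XN is not set because IOMMU_NOEXEC is absent.
 */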
The __arm_lpae_map() function is defined (in the drivers/iommu/io-pgtable-arm.c file) as follows:
/* IOPTE accessors */
#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))

#define iopte_type(pte,l)                                       \
        (((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK)

#define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK)

struct arm_lpae_io_pgtable {
        struct io_pgtable       iop;

        int                     pgd_bits;
        int                     start_level;
        int                     bits_per_level;

        void                    *pgd;
};

typedef u64 arm_lpae_iopte;

static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl,
                              enum io_pgtable_fmt fmt)
{
        if (lvl == (ARM_LPAE_MAX_LEVELS - 1) && fmt != ARM_MALI_LPAE)
                return iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_PAGE;

        return iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_BLOCK;
}

static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr,
                                     struct arm_lpae_io_pgtable *data)
{
        arm_lpae_iopte pte = paddr;

        /* Of the bits which overlap, either 51:48 or 15:12 are always RES0 */
        return (pte | (pte >> (48 - 12))) & ARM_LPAE_PTE_ADDR_MASK;
}

. . . . . .

static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
                                    struct io_pgtable_cfg *cfg)
{
        struct device *dev = cfg->iommu_dev;
        int order = get_order(size);
        struct page *p;
        dma_addr_t dma;
        void *pages;

        VM_BUG_ON((gfp & __GFP_HIGHMEM));
        p = alloc_pages_node(dev ? dev_to_node(dev) : NUMA_NO_NODE,
                             gfp | __GFP_ZERO, order);
        if (!p)
                return NULL;

        pages = page_address(p);
        if (!cfg->coherent_walk) {
                dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
                if (dma_mapping_error(dev, dma))
                        goto out_free;
                /*
                 * We depend on the IOMMU being able to work with any physical
                 * address directly, so if the DMA layer suggests otherwise by
                 * translating or truncating them, that bodes very badly...
                 */
                if (dma != virt_to_phys(pages))
                        goto out_unmap;
        }

        return pages;

out_unmap:
        dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
        dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
out_free:
        __free_pages(p, order);
        return NULL;
}

. . . . . .

static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep,
                                struct io_pgtable_cfg *cfg)
{
        dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep),
                                   sizeof(*ptep), DMA_TO_DEVICE);
}

static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
                               struct io_pgtable_cfg *cfg)
{
        *ptep = pte;

        if (!cfg->coherent_walk)
                __arm_lpae_sync_pte(ptep, cfg);
}

. . . . . .

static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
                                phys_addr_t paddr, arm_lpae_iopte prot,
                                int lvl, arm_lpae_iopte *ptep)
{
        arm_lpae_iopte pte = prot;

        if (data->iop.fmt != ARM_MALI_LPAE && lvl == ARM_LPAE_MAX_LEVELS - 1)
                pte |= ARM_LPAE_PTE_TYPE_PAGE;
        else
                pte |= ARM_LPAE_PTE_TYPE_BLOCK;

        pte |= paddr_to_iopte(paddr, data);

        __arm_lpae_set_pte(ptep, pte, &data->iop.cfg);
}

static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
                             unsigned long iova, phys_addr_t paddr,
                             arm_lpae_iopte prot, int lvl,
                             arm_lpae_iopte *ptep)
{
        arm_lpae_iopte pte = *ptep;

        if (iopte_leaf(pte, lvl, data->iop.fmt)) {
                /* We require an unmap first */
                WARN_ON(!selftest_running);
                return -EEXIST;
        } else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
                /*
                 * We need to unmap and free the old table before
                 * overwriting it with a block entry.
                 */
                arm_lpae_iopte *tblp;
                size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);

                tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
                if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) {
                        WARN_ON(1);
                        return -EINVAL;
                }
        }

        __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
        return 0;
}

static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
                                             arm_lpae_iopte *ptep,
                                             arm_lpae_iopte curr,
                                             struct arm_lpae_io_pgtable *data)
{
        arm_lpae_iopte old, new;
        struct io_pgtable_cfg *cfg = &data->iop.cfg;

        new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE;
        if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
                new |= ARM_LPAE_PTE_NSTABLE;

        /*
         * Ensure the table itself is visible before its PTE can be.
         * Whilst we could get away with cmpxchg64_release below, this
         * doesn't have any ordering semantics when !CONFIG_SMP.
         */
        dma_wmb();

        old = cmpxchg64_relaxed(ptep, curr, new);

        if (cfg->coherent_walk || (old & ARM_LPAE_PTE_SW_SYNC))
                return old;

        /* Even if it's not ours, there's no point waiting; just kick it */
        __arm_lpae_sync_pte(ptep, cfg);
        if (old == curr)
                WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);

        return old;
}

static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
                          phys_addr_t paddr, size_t size, arm_lpae_iopte prot,
                          int lvl, arm_lpae_iopte *ptep, gfp_t gfp)
{
        arm_lpae_iopte *cptep, pte;
        size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
        size_t tblsz = ARM_LPAE_GRANULE(data);
        struct io_pgtable_cfg *cfg = &data->iop.cfg;

        /* Find our entry at the current level */
        ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);

        /* If we can install a leaf entry at this level, then do so */
        if (size == block_size)
                return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);

        /* We can't allocate tables at the final level */
        if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1))
                return -EINVAL;

        /* Grab a pointer to the next level */
        pte = READ_ONCE(*ptep);
        if (!pte) {
                cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg);
                if (!cptep)
                        return -ENOMEM;

                pte = arm_lpae_install_table(cptep, ptep, 0, data);
                if (pte)
                        __arm_lpae_free_pages(cptep, tblsz, cfg);
        } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) {
                __arm_lpae_sync_pte(ptep, cfg);
        }

        if (pte && !iopte_leaf(pte, lvl, data->iop.fmt)) {
                cptep = iopte_deref(pte, data);
        } else if (pte) {
                /* We require an unmap first */
                WARN_ON(!selftest_running);
                return -EEXIST;
        }

        /* Rinse, repeat */
        return __arm_lpae_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp);
}
The __arm_lpae_map() function executes as follows (a standalone sketch of its index arithmetic follows this list):

- The arm_lpae_init_pte() function initializes and writes a page table entry. It first checks the existing entry: if it already holds a valid leaf mapping, the function fails with -EEXIST (an explicit unmap is required first), and if it holds a next-level table that is being replaced by a block entry, the old table is unmapped and freed first. It then calls __arm_lpae_init_pte() to construct the final entry and write it into the page table; within __arm_lpae_init_pte(), paddr_to_iopte() fills the physical page address into the entry, and __arm_lpae_set_pte() writes the entry into the table.
- The __arm_lpae_alloc_pages() function allocates memory for the next-level page table, and the arm_lpae_install_table() function installs it by writing the constructed table descriptor into the corresponding page directory entry, calling __arm_lpae_sync_pte() to synchronize the entry with the device.
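The recursion bottoms out when the remaining size equals the block size of the current level. Below is a standalone userspace sketch (not kernel code) of the per-level index arithmetic that __arm_lpae_map() performs via ARM_LPAE_LVL_IDX, modeled here for the common 4 KiB granule, 48-bit input address, 4-level configuration:

#include <stdio.h>
#include <stdint.h>

#define GRANULE_SHIFT   12      /* 4 KiB translation granule */
#define BITS_PER_LEVEL  9       /* 4 KiB / 8-byte PTEs = 512 entries */

static unsigned int lvl_idx(uint64_t iova, int lvl)
{
        /* Shift per level: lvl 3 -> 12, lvl 2 -> 21, lvl 1 -> 30, lvl 0 -> 39 */
        int shift = GRANULE_SHIFT + (3 - lvl) * BITS_PER_LEVEL;

        return (unsigned int)((iova >> shift) & ((1u << BITS_PER_LEVEL) - 1));
}

int main(void)
{
        uint64_t iova = 0x9f44a0000ULL; /* hypothetical IOVA */
        int lvl;

        for (lvl = 0; lvl <= 3; lvl++)
                printf("level %d index: %u\n", lvl, lvl_idx(iova, lvl));
        return 0;
}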
In the iommu_dma_alloc_remap() function, dma_common_pages_remap() is used to remap an array of PAGE_SIZE pages into another vm_area. The dma_common_pages_remap() function is defined (in the kernel/dma/remap.c file) as follows:
/*
* Remaps an array of PAGE_SIZE pages into another vm_area.
* Cannot be used in non-sleeping contexts
*/
void *dma_common_pages_remap(struct page **pages, size_t size,
pgprot_t prot, const void *caller)
{
void *vaddr;
vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT,
VM_DMA_COHERENT, prot);
if (vaddr)
find_vm_area(vaddr)->pages = pages;
return vaddr;
}
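The counterpart that undoes this remapping looks roughly as follows; it mirrors dma_common_free_remap() from the same file (modulo kernel version differences), verifying the area was created with VM_DMA_COHERENT before calling vunmap():

void dma_common_free_remap(void *cpu_addr, size_t size)
{
        struct vm_struct *area = find_vm_area(cpu_addr);

        /* Only areas created by dma_common_pages_remap() may be freed here */
        if (!area || area->flags != VM_DMA_COHERENT) {
                WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
                return;
        }

        vunmap(cpu_addr);
}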
When an SMMUv3 device fails to perform an I/O address translation, it posts a message to the event queue and raises an interrupt. The SMMUv3 device driver defines (in the drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h file) several event types:
#define EVT_ID_TRANSLATION_FAULT 0x10
#define EVT_ID_ADDR_SIZE_FAULT 0x11
#define EVT_ID_ACCESS_FAULT 0x12
#define EVT_ID_PERMISSION_FAULT 0x13
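In newer kernels these EVT_ID_* values are used to translate hardware events into generic IOMMU faults that can be reported to registered fault handlers. A simplified fragment, modeled on arm_smmu_handle_evt() (the exact mapping and surrounding code vary by kernel version):

/* Sketch: map an SMMUv3 event ID to a generic iommu fault reason */
switch (FIELD_GET(EVTQ_0_ID, evt[0])) {
case EVT_ID_TRANSLATION_FAULT:
        reason = IOMMU_FAULT_REASON_PTE_FETCH;
        break;
case EVT_ID_ADDR_SIZE_FAULT:
        reason = IOMMU_FAULT_REASON_OOR_ADDRESS;
        break;
case EVT_ID_ACCESS_FAULT:
        reason = IOMMU_FAULT_REASON_ACCESS;
        break;
case EVT_ID_PERMISSION_FAULT:
        reason = IOMMU_FAULT_REASON_PERMISSION;
        break;
default:
        return -EOPNOTSUPP;
}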
The SMMUv3 hardware can actually report more event types than these four. The ARM System Memory Management Unit Architecture Specification version 3 defines each event's type number, the detailed structure of the event record, and the conditions under which the event is triggered. The SMMUv3 hardware events are as follows:
Event type | Event type number |
---|---|
F_UUT | 0x01 |
C_BAD_STREAMID | 0x02 |
F_STE_FETCH | 0x03 |
C_BAD_STE | 0x04 |
F_BAD_ATS_TREQ | 0x05 |
F_STREAM_DISABLED | 0x06 |
F_TRANSL_FORBIDDEN | 0x07 |
C_BAD_SUBSTREAMID | 0x08 |
F_CD_FETCH | 0x09 |
C_BAD_CD | 0x0A |
F_WALK_EABT | 0x0B |
F_TRANSLATION | 0x10 |
F_ADDR_SIZE | 0x11 |
F_ACCESS | 0x12 |
F_PERMISSION | 0x13 |
F_TLB_CONFLICT | 0x20 |
F_CFG_CONFLICT | 0x21 |
E_PAGE_REQUEST | 0x24 |
F_VMS_FETCH | 0x25 |
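The driver drains the event queue in an interrupt thread and dumps every record it finds, which is what produces the kernel logs below. A trimmed sketch, modeled on arm_smmu_evtq_thread() in drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:

static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
{
        int i;
        struct arm_smmu_device *smmu = dev;
        struct arm_smmu_queue *q = &smmu->evtq.q;
        u64 evt[EVTQ_ENT_DWORDS];

        while (!queue_remove_raw(q, evt)) {
                u8 id = FIELD_GET(EVTQ_0_ID, evt[0]);

                /* This produces the "event 0x.. received:" lines below */
                dev_info(smmu->dev, "event 0x%02x received:\n", id);
                for (i = 0; i < ARRAY_SIZE(evt); ++i)
                        dev_info(smmu->dev, "\t0x%016llx\n",
                                 (unsigned long long)evt[i]);
        }

        /* ... sync the consumer pointer back to the hardware ... */
        return IRQ_HANDLED;
}

For example, when a device performs DMA to an I/O virtual address that has no valid mapping, F_TRANSLATION (0x10) events are reported: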
[ 130.837807] arm-smmu-v3 1000000.smmu: event 0x10 received:
[ 130.845314] arm-smmu-v3 1000000.smmu: 0x0000000100002810
[ 130.852634] arm-smmu-v3 1000000.smmu: 0x0000020880000b17
[ 130.859927] arm-smmu-v3 1000000.smmu: 0x00000009f44a0300
[ 130.867203] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 130.874834] arm-smmu-v3 1000000.smmu: event 0x10 received:
[ 130.882345] arm-smmu-v3 1000000.smmu: 0x0000000100002810
[ 130.889630] arm-smmu-v3 1000000.smmu: 0x0000020880000b18
[ 130.896921] arm-smmu-v3 1000000.smmu: 0x00000009f44a0380
[ 130.904214] arm-smmu-v3 1000000.smmu: 0x0000000000000000
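The four dumped words can be decoded by hand against the event record layout in the specification: bits [7:0] of the first word carry the event type, bits [63:32] the StreamID, and the third word the faulting input address. A hypothetical userspace helper (decode_translation_fault(), not part of the kernel) illustrating this for the first record above:

#include <stdint.h>
#include <stdio.h>

static void decode_translation_fault(const uint64_t evt[4])
{
        uint8_t  id        = (uint8_t)(evt[0] & 0xff);   /* event type number */
        uint32_t stream_id = (uint32_t)(evt[0] >> 32);   /* StreamID field */
        uint64_t iova      = evt[2];                     /* faulting input address */

        printf("event 0x%02x, StreamID 0x%x, faulting IOVA 0x%016llx\n",
               id, (unsigned)stream_id, (unsigned long long)iova);
}

int main(void)
{
        /* The first record from the log above */
        const uint64_t evt[4] = {
                0x0000000100002810ULL, 0x0000020880000b17ULL,
                0x00000009f44a0300ULL, 0x0000000000000000ULL,
        };

        decode_translation_fault(evt);
        return 0;
}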
When the pasid-num-bits property is present but the system I/O device ends up configured with 0 PASIDs, F_STREAM_DISABLED (0x06) events like the following are reported:
[ 27.960983] arm-smmu-v3 1000000.smmu: event 0x06 received:
[ 27.969199] arm-smmu-v3 1000000.smmu: 0x0000000100000006
[ 27.976969] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 27.984570] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 27.992193] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 27.999578] arm-smmu-v3 1000000.smmu: event 0x06 received:
[ 28.007042] arm-smmu-v3 1000000.smmu: 0x0000000100000006
[ 28.014346] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 28.021676] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 28.029014] arm-smmu-v3 1000000.smmu: 0x0000000000000000
When pasid-num-bits is configured as 0, i.e. the system I/O device is configured with 0 PASIDs, C_BAD_SUBSTREAMID (0x08) events like the following are reported:
[ 42.463517] arm-smmu-v3 1000000.smmu: event 0x08 received:
[ 42.471632] arm-smmu-v3 1000000.smmu: 0x0000000100000008
[ 42.479423] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 42.487026] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 42.494674] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 42.502219] arm-smmu-v3 1000000.smmu: event 0x08 received:
[ 42.509674] arm-smmu-v3 1000000.smmu: 0x0000000100000008
[ 42.516984] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 42.524302] arm-smmu-v3 1000000.smmu: 0x0000000000000000
[ 42.531604] arm-smmu-v3 1000000.smmu: 0x0000000000000000