The basic idea of vhost is that QEMU sets up shared buffers between the Guest and the Host: the Guest acts as the producer, filling the available descriptors with buffer information, and the Host acts as the consumer, consuming buffers through those available descriptors. Once the Host has consumed a buffer, it notifies the Guest so the descriptors can be reclaimed.
This article analyzes the vhost packet transmit and receive paths, based mainly on the 3.10 kernel.
Guest and Host pass packet data to each other through shared vring buffers. The relevant data structures are shown below; vring_virtqueue is the Guest-side structure and vhost_virtqueue is the Host-side structure.
- struct vring_virtqueue
- {
- struct virtqueue vq;
-
- /* Actual memory layout for this queue */
- /*
-  * Contains the desc, avail and used rings: desc holds the descriptor
-  * entries, avail holds the head ids of descriptor chains the guest has
-  * made available, and used holds the head ids of chains the host has
-  * consumed.
-  */
- struct vring vring;
-
- /* Can we use weak barriers? */
- bool weak_barriers;
-
- /* Other side has made a mess, don't try any more. */
- bool broken;
-
- /* Host supports indirect buffers */
- bool indirect;
-
- /* Host publishes avail event idx */
- bool event;
-
- /* Head of free buffer list. */
- unsigned int free_head;
- /* Number we've added since last sync. */
- unsigned int num_added;
-
- /* Last used index we've seen. */
- /*
-  * Used-ring index the Guest has already processed (reclaimed) up to.
-  */
- u16 last_used_idx;
-
- /* How to notify other side. FIXME: commonalize hcalls! */
- void (*notify)(struct virtqueue *vq);
-
- #ifdef DEBUG
- /* They're supposed to lock for us. */
- unsigned int in_use;
-
- /* Figure out if their kicks are too delayed. */
- bool last_add_time_valid;
- ktime_t last_add_time;
- #endif
-
- /* Tokens for callbacks. */
- void *data[];
- };

- struct vhost_virtqueue {
- struct vhost_dev *dev;
-
- /* The actual ring of buffers. */
- struct mutex mutex;
- unsigned int num;
- /*
-  * QEMU passes the guest addresses of the three vrings to vhost through
-  * the VHOST_SET_VRING_ADDR ioctl; vhost stores them in the fields below.
-  */
- struct vring_desc __user *desc;
- struct vring_avail __user *avail;
- struct vring_used __user *used;
- struct file *kick;
- struct file *call;
- struct file *error;
- struct eventfd_ctx *call_ctx;
- struct eventfd_ctx *error_ctx;
- struct eventfd_ctx *log_ctx;
-
- struct vhost_poll poll;
-
- /* The routine to call when the Guest pings us, or timeout. */
- vhost_work_fn_t handle_kick;
-
- /* Last available index we saw. */
- /*
-  * Index of the next avail-ring entry the host will consume.
-  */
- u16 last_avail_idx;
-
- /* Caches available index value from user. */
- u16 avail_idx;
-
- /* Last index we used. */
- u16 last_used_idx;
-
- /* Used flags */
- u16 used_flags;
-
- /* Last used index value we have signalled on */
- u16 signalled_used;
-
- /* Last used index value we have signalled on */
- bool signalled_used_valid;
-
- /* Log writes to used structure. */
- bool log_used;
- u64 log_addr;
-
- struct iovec iov[UIO_MAXIOV];
- struct iovec *indirect;
- struct vring_used_elem *heads;
- /* We use a kind of RCU to access private pointer.
- * All readers access it from worker, which makes it possible to
- * flush the vhost_work instead of synchronize_rcu. Therefore readers do
- * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
- * vhost_work execution acts instead of rcu_read_lock() and the end of
- * vhost_work execution acts instead of rcu_read_unlock().
- * Writers use virtqueue mutex. */
- void __rcu *private_data;
- /* Log write descriptors */
- void __user *log_base;
- struct vhost_log *log;
- };
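
Both sides ultimately operate on the same ring memory in guest physical address space. For reference, the element layouts of that shared ring (the legacy/split ring from include/uapi/linux/virtio_ring.h, as used by 3.10-era kernels) are:

- struct vring_desc {
- 	__u64 addr;   /* guest-physical address of the buffer */
- 	__u32 len;    /* length of the buffer */
- 	__u16 flags;  /* VRING_DESC_F_NEXT / _WRITE / _INDIRECT */
- 	__u16 next;   /* index of the next descriptor in the chain */
- };
-
- struct vring_avail {
- 	__u16 flags;
- 	__u16 idx;     /* bumped by the guest in virtqueue_add */
- 	__u16 ring[];  /* head ids of available descriptor chains */
- };
-
- struct vring_used_elem {
- 	__u32 id;   /* head id of a consumed descriptor chain */
- 	__u32 len;  /* total bytes written into that chain */
- };
-
- struct vring_used {
- 	__u16 flags;
- 	__u16 idx;     /* bumped by vhost in vhost_add_used */
- 	struct vring_used_elem ring[];
- };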

- static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
- void (*callback)(struct virtqueue *vq),
- const char *name,
- u16 msix_vec)
- {
- struct virtio_pci_device *vp_dev = to_vp_device(vdev);
- struct virtio_pci_vq_info *info;
- struct virtqueue *vq;
- unsigned long flags, size;
- u16 num;
- int err;
-
- /* Select the queue we're interested in */
- iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
-
- /* Check if queue is either not available or already active. */
- /* Read the number of ring entries configured for this queue */
- num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
- if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
- return ERR_PTR(-ENOENT);
-
- /* allocate and fill out our structure the represents an active
- * queue */
- info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL);
- if (!info)
- return ERR_PTR(-ENOMEM);
-
- info->num = num;
- info->msix_vector = msix_vec;
-
- size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
- /* Allocate the pages backing the whole vring (desc, avail and used) */
- info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
- if (info->queue == NULL) {
- err = -ENOMEM;
- goto out_info;
- }
-
- /* activate the queue */
- iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
- vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
-
- /* create the vring */
- vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
- true, info->queue, vp_notify, callback, name);
- if (!vq) {
- err = -ENOMEM;
- goto out_activate_queue;
- }
-
- vq->priv = info;
- info->vq = vq;
-
- if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
- iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
- msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
- if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
- err = -EBUSY;
- goto out_assign;
- }
- }
-
- if (callback) {
- spin_lock_irqsave(&vp_dev->lock, flags);
- list_add(&info->node, &vp_dev->virtqueues);
- spin_unlock_irqrestore(&vp_dev->lock, flags);
- } else {
- INIT_LIST_HEAD(&info->node);
- }
-
- return vq;
-
- out_assign:
- vring_del_virtqueue(vq);
- out_activate_queue:
- iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
- free_pages_exact(info->queue, size);
- out_info:
- kfree(info);
- return ERR_PTR(err);
- }

When the Guest initializes a virtio device, setup_vq first reads the configured number of ring entries, then allocates the memory that backs the ring, and finally calls vring_new_virtqueue to create the vq and finish initializing the vring.
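Before looking at vring_new_virtqueue, it is worth noting how that one contiguous allocation is carved up. The size and layout come from vring_size() and vring_init() in include/uapi/linux/virtio_ring.h (shown here roughly as in the 3.10-era headers): the desc array comes first, the avail ring follows it, and the used ring starts at the next align-byte boundary.

- static inline void vring_init(struct vring *vr, unsigned int num, void *p,
- 			      unsigned long align)
- {
- 	vr->num = num;
- 	vr->desc = p;
- 	vr->avail = p + num * sizeof(struct vring_desc);
- 	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__u16)
- 		+ align - 1) & ~(align - 1));
- }
-
- static inline unsigned vring_size(unsigned int num, unsigned long align)
- {
- 	return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num)
- 		 + align - 1) & ~(align - 1))
- 		+ sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num;
- }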
- struct virtqueue *vring_new_virtqueue(unsigned int index,
- unsigned int num,
- unsigned int vring_align,
- struct virtio_device *vdev,
- bool weak_barriers,
- void *pages,
- void (*notify)(struct virtqueue *),
- void (*callback)(struct virtqueue *),
- const char *name)
- {
- struct vring_virtqueue *vq;
- unsigned int i;
-
- /* We assume num is a power of 2. */
- if (num & (num - 1)) {
- dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
- return NULL;
- }
-
- vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
- if (!vq)
- return NULL;
-
- vring_init(&vq->vring, num, pages, vring_align);
- vq->vq.callback = callback;
- vq->vq.vdev = vdev;
- vq->vq.name = name;
- vq->vq.num_free = num;
- vq->vq.index = index;
- vq->notify = notify;
- vq->weak_barriers = weak_barriers;
- vq->broken = false;
- vq->last_used_idx = 0;
- vq->num_added = 0;
- list_add_tail(&vq->vq.list, &vdev->vqs);
- #ifdef DEBUG
- vq->in_use = false;
- vq->last_add_time_valid = false;
- #endif
-
- vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
- vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
-
- /* No callback? Tell other side not to bother us. */
- if (!callback)
- vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
-
- /* Put everything in free lists. */
- vq->free_head = 0;
- /* Chain the descriptors into the initial free list */
- for (i = 0; i < num-1; i++) {
- vq->vring.desc[i].next = i+1;
- vq->data[i] = NULL;
- }
- /* Initialize the token array: data[] holds the driver's buffer token (e.g. an skb) for each chain head; received packet data ends up in the buffers these tokens describe */
- vq->data[i] = NULL;
-
- return &vq->vq;
- }

When the Guest needs to transmit a packet, start_xmit (drivers/net/virtio_net.c) is called; it eventually reaches virtqueue_add, which stores the skb as the token in vq->data and fills the descriptors with the buffer addresses.
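In 3.10 the driver does not call virtqueue_add directly: start_xmit's helper xmit_skb builds a scatterlist covering the virtio-net header plus the skb data and goes through the virtqueue_add_outbuf wrapper, roughly as below (a sketch of the drivers/virtio/virtio_ring.c wrapper: one array of out scatterlists, no in buffers, the skb as the data token).

- int virtqueue_add_outbuf(struct virtqueue *vq,
- 			 struct scatterlist sg[], unsigned int num,
- 			 void *data,
- 			 gfp_t gfp)
- {
- 	/* one out sg array, no in buffers; data is the skb token */
- 	return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
- }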
- static inline int virtqueue_add(struct virtqueue *_vq,
- struct scatterlist *sgs[],
- struct scatterlist *(*next)
- (struct scatterlist *, unsigned int *),
- unsigned int total_out,
- unsigned int total_in,
- unsigned int out_sgs,
- unsigned int in_sgs,
- void *data,
- gfp_t gfp)
- {
- struct vring_virtqueue *vq = to_vvq(_vq);
- struct scatterlist *sg;
- unsigned int i, n, avail, uninitialized_var(prev), total_sg;
- int head;
-
- START_USE(vq);
-
- BUG_ON(data == NULL);
-
- #ifdef DEBUG
- {
- ktime_t now = ktime_get();
-
- /* No kick or get, with .1 second between? Warn. */
- if (vq->last_add_time_valid)
- WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
- > 100);
- vq->last_add_time = now;
- vq->last_add_time_valid = true;
- }
- #endif
-
- total_sg = total_in + total_out;
-
- /* If the host supports indirect descriptor tables, and we have multiple
- * buffers, then go indirect. FIXME: tune this threshold */
- if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
- head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
- total_in,
- out_sgs, in_sgs, gfp);
- if (likely(head >= 0))
- goto add_head;
- }
-
- BUG_ON(total_sg > vq->vring.num);
- BUG_ON(total_sg == 0);
-
- if (vq->vq.num_free < total_sg) {
- pr_debug("Can't add buf len %i - avail = %i\n",
- total_sg, vq->vq.num_free);
- /* FIXME: for historical reasons, we force a notify here if
- * there are outgoing parts to the buffer. Presumably the
- * host should service the ring ASAP. */
- if (out_sgs)
- vq->notify(&vq->vq);
- END_USE(vq);
- return -ENOSPC;
- }
-
- /* We're about to use some buffers from the free list. */
- vq->vq.num_free -= total_sg;
- /* Grab the index of the first free descriptor */
- head = i = vq->free_head;
- /* Fill in the descriptor entries */
- for (n = 0; n < out_sgs; n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- i = vq->vring.desc[i].next;
- }
- }
- for (; n < (out_sgs + in_sgs); n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- i = vq->vring.desc[i].next;
- }
- }
- /* Last one doesn't continue. */
- vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
-
- /* Update free pointer */
- vq->free_head = i;
-
- add_head:
- /* Set token. */
- vq->data[head] = data;
-
- /* Put entry in available array (but don't update avail->idx until they
- * do sync). */
- avail = (vq->vring.avail->idx & (vq->vring.num-1));
- vq->vring.avail->ring[avail] = head;
- /* Descriptors and available array need to be set before we expose the
- * new available array entries. */
- virtio_wmb(vq->weak_barriers);
- /* Bump avail->idx; the host reads this value when it fetches available descriptors */
- vq->vring.avail->idx++;
- vq->num_added++;
- /* This is very unlikely, but theoretically possible. Kick
- * just in case. */
- if (unlikely(vq->num_added == (1 << 16) - 1))
- virtqueue_kick(_vq);
- pr_debug("Added buffer head %i to %p\n", head, vq);
- END_USE(vq);
- return 0;
- }

After start_xmit has added the out buffers, it calls virtqueue_kick to notify the host.
- void virtqueue_kick(struct virtqueue *vq)
- {
- if (virtqueue_kick_prepare(vq))
- virtqueue_notify(vq);
- }
Inside virtqueue_kick, virtqueue_kick_prepare uses the return value of vring_need_event to decide whether the host actually needs to be notified. Let's look at vring_need_event:
- static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
- {
- /* Note: Xen has similar logic for notification hold-off
- * in include/xen/interface/io/ring.h with req_event and req_prod
- * corresponding to event_idx + 1 and new_idx respectively.
- * Note also that req_event and req_prod in Xen start at 1,
- * event indexes in virtio start at 0. */
- return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
- }
Now look at where event_idx comes from:
#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
So where is this value updated? Going back to the vhost code, you will find that after consuming ring buffers vhost calls vhost_update_avail_event, which does __put_user(vq->avail_idx, vhost_avail_event(vq)), i.e. it writes avail_idx into vhost_avail_event(vq). Here is the definition of vhost_avail_event(vq):
#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
As you can see, vhost_avail_event is the same slot as vring_avail_event. As for avail_idx: when vhost fetches available descriptors (vhost_get_vq_desc, called from handle_tx/handle_rx), it reads it with __get_user(vq->avail_idx, &vq->avail->idx); it is the index of the buffers the Guest has made available so far, and the Guest bumps vq->avail->idx every time it adds a buffer (virtqueue_add). To sum up:
The Guest updates vq->avail->idx when it adds buffers; the host reads that value when it fetches available buffers and writes it back into vhost_avail_event. After adding new buffers, the Guest checks whether the vhost_avail_event value written by the host has caught up with its latest index. If it has, the Guest notifies the host via virtqueue_notify; if not, the host is still behind on earlier buffers, so the notification is deferred to a later kick.
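To make the hold-off behaviour concrete, here is a small stand-alone user-space check of the same arithmetic (the index values are made up for illustration):

- #include <stdio.h>
-
- /* Same arithmetic as the kernel's vring_need_event() shown above. */
- static int vring_need_event(unsigned short event_idx,
- 			    unsigned short new_idx, unsigned short old)
- {
- 	return (unsigned short)(new_idx - event_idx - 1) <
- 	       (unsigned short)(new_idx - old);
- }
-
- int main(void)
- {
- 	/* vhost last saw avail->idx == 10 and wrote 10 into avail_event. */
- 	unsigned short event_idx = 10;
-
- 	/* Guest adds the 11th buffer: old = 10, new = 11 -> kick (prints 1). */
- 	printf("%d\n", vring_need_event(event_idx, 11, 10));
-
- 	/* vhost has not re-read avail->idx yet; Guest adds the 12th buffer:
- 	 * old = 11, new = 12 -> no kick, vhost is still catching up (prints 0). */
- 	printf("%d\n", vring_need_event(event_idx, 12, 11));
-
- 	return 0;
- }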
When the Guest calls virtqueue_notify, the write to the notification register traps into the host (a PIO/MMIO exit), and the eventfd mechanism wakes up the vhost worker thread (the wakeup path will be analyzed separately). When the vhost thread is woken up for transmit, it calls handle_tx (drivers/vhost/net.c), which in turn calls vhost_get_vq_desc to fetch the buffers the Guest has filled in.
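For orientation, a heavily simplified skeleton of that handle_tx loop is shown below (based on drivers/vhost/net.c of that era; zerocopy, batching limits and error handling are omitted, so treat it as a sketch rather than the exact source):

- static void handle_tx(struct vhost_net *net)
- {
- 	struct vhost_virtqueue *vq = &net->vqs[VHOST_NET_VQ_TX].vq;
- 	struct socket *sock = rcu_dereference(vq->private_data); /* tap/macvtap */
- 	struct msghdr msg = { .msg_iov = vq->iov };
- 	unsigned out, in;
- 	int head;
-
- 	mutex_lock(&vq->mutex);
- 	vhost_disable_notify(&net->dev, vq);
-
- 	for (;;) {
- 		/* Pull the next descriptor chain the Guest made available */
- 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
- 					 ARRAY_SIZE(vq->iov),
- 					 &out, &in, NULL, NULL);
- 		if (head < 0)
- 			break;
- 		if (head == vq->num) {
- 			/* Ring empty: re-enable Guest kicks and stop */
- 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- 				vhost_disable_notify(&net->dev, vq);
- 				continue;
- 			}
- 			break;
- 		}
-
- 		/* Hand the Guest buffers to the backend socket */
- 		msg.msg_iovlen = out;
- 		sock->ops->sendmsg(NULL, sock, &msg, iov_length(vq->iov, out));
-
- 		/* Return the chain to the Guest through the used ring */
- 		vhost_add_used_and_signal(&net->dev, vq, head, 0);
- 	}
-
- 	mutex_unlock(&vq->mutex);
- }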
- int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
- {
- struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
- int ret;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- /*
-  * Read the latest avail index published by the Guest.
-  */
- if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return -EFAULT;
- }
-
- if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return -EFAULT;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- /*
-  * last_avail_idx is the next avail-ring index vhost will consume; it is
-  * incremented each time vhost uses a descriptor. If vq->avail_idx equals
-  * last_avail_idx, the Guest has not added any new buffers, i.e. there is
-  * nothing to transmit right now.
-  */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been exposed by guest. */
- smp_rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (unlikely(__get_user(head,
- &vq->avail->ring[last_avail_idx % vq->num]))) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return -EFAULT;
- }
-
- /* If their number is silly, that's an error. */
- if (unlikely(head >= vq->num)) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return -EINVAL;
- }
-
- /* When we start there are none of either input nor output. */
- *out_num = *in_num = 0;
- if (unlikely(log))
- *log_num = 0;
-
- i = head;
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(i >= vq->num)) {
- vq_err(vq, "Desc index is %u > %u, head = %u",
- i, vq->num, head);
- return -EINVAL;
- }
- if (unlikely(++found > vq->num)) {
- vq_err(vq, "Loop detected: last one at %u "
- "vq size %u head %u\n",
- i, vq->num, head);
- return -EINVAL;
- }
- ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
- if (unlikely(ret)) {
- vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
- i, vq->desc + i);
- return -EFAULT;
- }
- if (desc.flags & VRING_DESC_F_INDIRECT) {
- ret = get_indirect(dev, vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
- if (unlikely(ret < 0)) {
- vq_err(vq, "Failure detected "
- "in indirect descriptor at idx %d\n", i);
- return ret;
- }
- continue;
- }
-
- ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
- iov_size - iov_count);
- if (unlikely(ret < 0)) {
- vq_err(vq, "Translation failure %d descriptor idx %d\n",
- ret, i);
- return ret;
- }
- if (desc.flags & VRING_DESC_F_WRITE) {
- /* If this is an input descriptor,
- * increment that count. */
- *in_num += ret;
- if (unlikely(log)) {
- log[*log_num].addr = desc.addr;
- log[*log_num].len = desc.len;
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Descriptor has out after in: "
- "idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(&desc)) != -1);
-
- /* On success, increment avail index. */
- /*
-  * One descriptor chain successfully fetched; advance last_avail_idx.
-  */
- vq->last_avail_idx++;
-
- /* Assume notifications from guest are disabled at this point,
- * if they aren't we would need to update avail_event index. */
- BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
- return head;
- }

After vhost has fetched the available buffers, it sends the packet via sock->ops->sendmsg, then calls vhost_add_used to hand the used buffer information back to the Guest.
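In handle_tx this is typically done through the vhost_add_used_and_signal helper (drivers/vhost/vhost.c), which pairs the used-ring update with a possible Guest notification:

- void vhost_add_used_and_signal(struct vhost_dev *dev,
- 			       struct vhost_virtqueue *vq,
- 			       unsigned int head, int len)
- {
- 	vhost_add_used(vq, head, len);
- 	vhost_signal(dev, vq);
- }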
- int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
- {
- struct vring_used_elem __user *used;
-
- /* The virtqueue contains a ring of used buffers. Get a pointer to the
- * next entry in that used ring. */
- used = &vq->used->ring[vq->last_used_idx % vq->num];
- if (__put_user(head, &used->id)) {
- vq_err(vq, "Failed to write used id");
- return -EFAULT;
- }
- if (__put_user(len, &used->len)) {
- vq_err(vq, "Failed to write used len");
- return -EFAULT;
- }
- /* Make sure buffer is written before we update index. */
- smp_wmb();
- /*
-  * Write last_used_idx + 1 into vq->used->idx.
-  */
- if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) {
- vq_err(vq, "Failed to increment used idx");
- return -EFAULT;
- }
- if (unlikely(vq->log_used)) {
- /* Make sure data is seen before log. */
- smp_wmb();
- /* Log used ring entry write. */
- log_write(vq->log_base,
- vq->log_addr +
- ((void __user *)used - (void __user *)vq->used),
- sizeof *used);
- /* Log used index update. */
- log_write(vq->log_base,
- vq->log_addr + offsetof(struct vring_used, idx),
- sizeof vq->used->idx);
- if (vq->log_ctx)
- eventfd_signal(vq->log_ctx, 1);
- }
- /*
-  * Advance vhost's count of used descriptors.
-  */
- vq->last_used_idx++;
- /* If the driver never bothers to signal in a very long while,
- * used index might wrap around. If that happens, invalidate
- * signalled_used index we stored. TODO: make sure driver
- * signals at least once in 2^16 and remove this. */
- if (unlikely(vq->last_used_idx == vq->signalled_used))
- vq->signalled_used_valid = false;
- return 0;
- }

vhost_add_used uses __put_user(vq->last_used_idx + 1, &vq->used->idx) to publish the current used index into vq->used->idx. This tells the Guest how far vhost has got through the ring, so that when the Guest reclaims buffers or picks up packets forwarded by vhost it knows where to read from.
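On the Guest side, this published index is exactly what more_used() in drivers/virtio/virtio_ring.c checks when deciding whether there is anything left to collect:

- static inline bool more_used(const struct vring_virtqueue *vq)
- {
- 	return vq->last_used_idx != vq->vring.used->idx;
- }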
When vhost receives packets handed up by the underlying device (tap/macvtap), handle_rx is called. It first obtains the available descriptors via get_rx_bufs, then receives the packet data with sock->ops->recvmsg, and finally calls vhost_add_used_and_signal_n to record the used descriptors.
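vhost_add_used_and_signal_n itself is a thin wrapper (drivers/vhost/vhost.c) that batches the used-ring update and then signals the Guest:

- void vhost_add_used_and_signal_n(struct vhost_dev *dev,
- 				 struct vhost_virtqueue *vq,
- 				 struct vring_used_elem *heads, unsigned count)
- {
- 	vhost_add_used_n(vq, heads, count);
- 	vhost_signal(dev, vq);
- }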
- int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
- unsigned count)
- {
- int start, n, r;
-
- start = vq->last_used_idx % vq->num;
- n = vq->num - start;
- if (n < count) {
- r = __vhost_add_used_n(vq, heads, n);
- if (r < 0)
- return r;
- heads += n;
- count -= n;
- }
- r = __vhost_add_used_n(vq, heads, count);
-
- /* Make sure buffer is written before we update index. */
- smp_wmb();
- /*
-  * Publish the current used index to vq->used->idx so the Guest's
-  * receive path can see how far vhost has written.
-  */
- if (put_user(vq->last_used_idx, &vq->used->idx)) {
- vq_err(vq, "Failed to increment used idx");
- return -EFAULT;
- }
- if (unlikely(vq->log_used)) {
- /* Log used index update. */
- log_write(vq->log_base,
- vq->log_addr + offsetof(struct vring_used, idx),
- sizeof vq->used->idx);
- if (vq->log_ctx)
- eventfd_signal(vq->log_ctx, 1);
- }
- return r;
- }

In __vhost_add_used_n, vhost updates last_used_idx (new = (vq->last_used_idx += count)) and then publishes the new value to the Guest with put_user(vq->last_used_idx, &vq->used->idx). Once the indexes are updated, vhost calls vhost_signal to notify the Guest that packets are ready. vhost_signal first decides whether a notification is actually needed, using the same logic the Guest uses when deciding whether to notify the Host: vring_need_event is evaluated again, this time with event_idx being the used index the Guest has reclaimed up to. If the Guest is reclaiming descriptors slowly, it still has plenty of received packets pending, so the notification is skipped for now.
- static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
- {
- /* Note: Xen has similar logic for notification hold-off
- * in include/xen/interface/io/ring.h with req_event and req_prod
- * corresponding to event_idx + 1 and new_idx respectively.
- * Note also that req_event and req_prod in Xen start at 1,
- * event indexes in virtio start at 0. */
- return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
- }
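
The check is wrapped by vhost_notify, and the actual notification travels over the call eventfd; vhost_signal (drivers/vhost/vhost.c) ties the two together:

- void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
- {
- 	/* Signal the Guest: tell them we used something up. */
- 	if (vq->call_ctx && vhost_notify(dev, vq))
- 		eventfd_signal(vq->call_ctx, 1);
- }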
When vhost decides the Guest does need to be notified, it injects a virtual interrupt into the Guest through irqfd (the detailed flow will be analyzed separately). On receiving the interrupt, the Guest runs the handler skb_recv_done, which raises the NET_RX_SOFTIRQ softirq, and packet reception is then handled in virtnet_poll (see init_vqs in drivers/net/virtio_net.c).
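A sketch of that interrupt callback (skb_recv_done in drivers/net/virtio_net.c; minor details may differ across 3.10.x revisions): it simply masks further virtqueue callbacks and schedules NAPI.

- static void skb_recv_done(struct virtqueue *rvq)
- {
- 	struct virtnet_info *vi = rvq->vdev->priv;
- 	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
-
- 	/* Schedule NAPI, suppress further interrupts if successful. */
- 	if (napi_schedule_prep(&rq->napi)) {
- 		virtqueue_disable_cb(rvq);
- 		__napi_schedule(&rq->napi);
- 	}
- }

The NAPI poll routine itself follows: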
- static int virtnet_poll(struct napi_struct *napi, int budget)
- {
- 	struct receive_queue *rq =
- 		container_of(napi, struct receive_queue, napi);
- 	struct virtnet_info *vi = rq->vq->vdev->priv;
- 	void *buf;
- 	unsigned int r, len, received = 0;
-
- again:
- 	/* Fetch the buffers the Host has filled in */
- 	while (received < budget &&
- 	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
- 		receive_buf(rq, buf, len);
- 		--rq->num;
- 		received++;
- 	}
-
- 	if (rq->num < rq->max / 2) {
- 		/* Repost descriptors to refill the rx ring */
- 		if (!try_fill_recv(rq, GFP_ATOMIC))
- 			schedule_delayed_work(&vi->refill, 0);
- 	}
-
- 	/* Out of packets? */
- 	if (received < budget) {
- 		r = virtqueue_enable_cb_prepare(rq->vq);
- 		napi_complete(napi);
- 		if (unlikely(virtqueue_poll(rq->vq, r)) &&
- 		    napi_schedule_prep(napi)) {
- 			virtqueue_disable_cb(rq->vq);
- 			__napi_schedule(napi);
- 			goto again;
- 		}
- 	}
-
- 	return received;
- }

virtqueue_get_buf retrieves the packet buffers filled in by the Host; the packets are then handed to the network stack for processing.
- void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
- {
- 	struct vring_virtqueue *vq = to_vvq(_vq);
- 	void *ret;
- 	unsigned int i;
- 	u16 last_used;
-
- 	START_USE(vq);
-
- 	if (unlikely(vq->broken)) {
- 		END_USE(vq);
- 		return NULL;
- 	}
-
- 	/*
- 	 * Compare vq->last_used_idx with vq->vring.used->idx: last_used_idx is
- 	 * the used entry the Guest has processed up to, while used->idx is
- 	 * written by vhost and marks how far the host has got. If they are
- 	 * equal, there are no received packets left for the Guest to handle.
- 	 */
- 	if (!more_used(vq)) {
- 		pr_debug("No more buffers in queue\n");
- 		END_USE(vq);
- 		return NULL;
- 	}
-
- 	/* Only get used array entries after they have been exposed by host. */
- 	virtio_rmb(vq->weak_barriers);
-
- 	last_used = (vq->last_used_idx & (vq->vring.num - 1));
- 	i = vq->vring.used->ring[last_used].id;
- 	*len = vq->vring.used->ring[last_used].len;
-
- 	if (unlikely(i >= vq->vring.num)) {
- 		BAD_RING(vq, "id %u out of range\n", i);
- 		return NULL;
- 	}
- 	if (unlikely(!vq->data[i])) {
- 		BAD_RING(vq, "id %u is not a head!\n", i);
- 		return NULL;
- 	}
-
- 	/* detach_buf clears data, so grab it now. */
- 	/* vq->data[i] is the driver token for this packet buffer */
- 	ret = vq->data[i];
- 	detach_buf(vq, i);
-
- 	/* Each completed buffer advances last_used_idx by one */
- 	vq->last_used_idx++;
- 	/* If we expect an interrupt for the next entry, tell host
- 	 * by writing event index and flush out the write before
- 	 * the read in the next get_buf call. */
- 	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
- 		vring_used_event(&vq->vring) = vq->last_used_idx;
- 		virtio_mb(vq->weak_barriers);
- 	}
-
- #ifdef DEBUG
- 	vq->last_add_time_valid = false;
- #endif
-
- 	END_USE(vq);
- 	return ret;
- }
- EXPORT_SYMBOL_GPL(virtqueue_get_buf);
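
Note the write to vring_used_event(&vq->vring) near the end: used_event is the mirror image of avail_event. It lives in the slot right after the avail ring, and vhost_notify reads it (feeding it into vring_need_event) to decide whether an interrupt needs to be injected:
#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num])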
