The basic idea of vhost is that QEMU sets up shared buffers between the Guest and the Host: the Guest acts as the producer, filling the available descriptors with buffer information, and the Host acts as the consumer, consuming buffers through those available descriptors. Once the Host has consumed a buffer, it notifies the Guest so the descriptors can be reclaimed.
This article analyzes the vhost packet transmit and receive paths, based mainly on the 3.10 kernel.
Guest and Host pass packet data to each other through shared vring buffers. The relevant data structures are shown below; vring_virtqueue is the Guest-side structure and vhost_virtqueue is the Host-side structure.
- struct vring_virtqueue
- {
- struct virtqueue vq;
-
- /* Actual memory layout for this queue */
- /*
-  * Contains the desc, avail and used rings: desc holds the descriptor
-  * entries, avail holds the head ids of descriptor chains the guest has
-  * made available, and used holds the head ids of chains the host has
-  * consumed.
-  */
- struct vring vring;
-
- /* Can we use weak barriers? */
- bool weak_barriers;
-
- /* Other side has made a mess, don't try any more. */
- bool broken;
-
- /* Host supports indirect buffers */
- bool indirect;
-
- /* Host publishes avail event idx */
- bool event;
-
- /* Head of free buffer list. */
- unsigned int free_head;
- /* Number we've added since last sync. */
- unsigned int num_added;
-
- /* Last used index we've seen. */
- /*
-  * Used-ring index the Guest has already processed (reclaimed) up to.
-  */
- u16 last_used_idx;
-
- /* How to notify other side. FIXME: commonalize hcalls! */
- void (*notify)(struct virtqueue *vq);
-
- #ifdef DEBUG
- /* They're supposed to lock for us. */
- unsigned int in_use;
-
- /* Figure out if their kicks are too delayed. */
- bool last_add_time_valid;
- ktime_t last_add_time;
- #endif
-
- /* Tokens for callbacks. */
- void *data[];
- };

- struct vhost_virtqueue {
- struct vhost_dev *dev;
-
- /* The actual ring of buffers. */
- struct mutex mutex;
- unsigned int num;
- /*
-  * QEMU passes the guest addresses of the three vrings to vhost through
-  * the VHOST_SET_VRING_ADDR ioctl; vhost stores them in the fields below.
-  */
- struct vring_desc __user *desc;
- struct vring_avail __user *avail;
- struct vring_used __user *used;
- struct file *kick;
- struct file *call;
- struct file *error;
- struct eventfd_ctx *call_ctx;
- struct eventfd_ctx *error_ctx;
- struct eventfd_ctx *log_ctx;
-
- struct vhost_poll poll;
-
- /* The routine to call when the Guest pings us, or timeout. */
- vhost_work_fn_t handle_kick;
-
- /* Last available index we saw. */
- /*
-  * Index of the next avail-ring entry the host will consume.
-  */
- u16 last_avail_idx;
-
- /* Caches available index value from user. */
- u16 avail_idx;
-
- /* Last index we used. */
- u16 last_used_idx;
-
- /* Used flags */
- u16 used_flags;
-
- /* Last used index value we have signalled on */
- u16 signalled_used;
-
- /* Last used index value we have signalled on */
- bool signalled_used_valid;
-
- /* Log writes to used structure. */
- bool log_used;
- u64 log_addr;
-
- struct iovec iov[UIO_MAXIOV];
- struct iovec *indirect;
- struct vring_used_elem *heads;
- /* We use a kind of RCU to access private pointer.
- * All readers access it from worker, which makes it possible to
- * flush the vhost_work instead of synchronize_rcu. Therefore readers do
- * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
- * vhost_work execution acts instead of rcu_read_lock() and the end of
- * vhost_work execution acts instead of rcu_read_unlock().
- * Writers use virtqueue mutex. */
- void __rcu *private_data;
- /* Log write descriptors */
- void __user *log_base;
- struct vhost_log *log;
- };
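
Both sides ultimately operate on the same ring memory in guest physical address space. For reference, the element layouts of that shared ring (the legacy/split ring from include/uapi/linux/virtio_ring.h, as used by 3.10-era kernels) are:

- struct vring_desc {
- 	__u64 addr;   /* guest-physical address of the buffer */
- 	__u32 len;    /* length of the buffer */
- 	__u16 flags;  /* VRING_DESC_F_NEXT / _WRITE / _INDIRECT */
- 	__u16 next;   /* index of the next descriptor in the chain */
- };
-
- struct vring_avail {
- 	__u16 flags;
- 	__u16 idx;     /* bumped by the guest in virtqueue_add */
- 	__u16 ring[];  /* head ids of available descriptor chains */
- };
-
- struct vring_used_elem {
- 	__u32 id;   /* head id of a consumed descriptor chain */
- 	__u32 len;  /* total bytes written into that chain */
- };
-
- struct vring_used {
- 	__u16 flags;
- 	__u16 idx;     /* bumped by vhost in vhost_add_used */
- 	struct vring_used_elem ring[];
- };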

- static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
- void (*callback)(struct virtqueue *vq),
- const char *name,
- u16 msix_vec)
- {
- struct virtio_pci_device *vp_dev = to_vp_device(vdev);
- struct virtio_pci_vq_info *info;
- struct virtqueue *vq;
- unsigned long flags, size;
- u16 num;
- int err;
-
- /* Select the queue we're interested in */
- iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
-
- /* Check if queue is either not available or already active. */
- /* Read the number of ring entries configured for this queue */
- num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
- if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
- return ERR_PTR(-ENOENT);
-
- /* allocate and fill out our structure the represents an active
- * queue */
- info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL);
- if (!info)
- return ERR_PTR(-ENOMEM);
-
- info->num = num;
- info->msix_vector = msix_vec;
-
- size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
- /* Allocate the pages backing the whole vring (desc, avail and used) */
- info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
- if (info->queue == NULL) {
- err = -ENOMEM;
- goto out_info;
- }
-
- /* activate the queue */
- iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
- vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
-
- /* create the vring */
- vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
- true, info->queue, vp_notify, callback, name);
- if (!vq) {
- err = -ENOMEM;
- goto out_activate_queue;
- }
-
- vq->priv = info;
- info->vq = vq;
-
- if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
- iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
- msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
- if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
- err = -EBUSY;
- goto out_assign;
- }
- }
-
- if (callback) {
- spin_lock_irqsave(&vp_dev->lock, flags);
- list_add(&info->node, &vp_dev->virtqueues);
- spin_unlock_irqrestore(&vp_dev->lock, flags);
- } else {
- INIT_LIST_HEAD(&info->node);
- }
-
- return vq;
-
- out_assign:
- vring_del_virtqueue(vq);
- out_activate_queue:
- iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
- free_pages_exact(info->queue, size);
- out_info:
- kfree(info);
- return ERR_PTR(err);
- }

When the Guest initializes a virtio device, setup_vq first reads the configured number of ring entries, then allocates the memory that backs the ring, and finally calls vring_new_virtqueue to create the vq and finish initializing the vring.
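Before looking at vring_new_virtqueue, it is worth noting how that one contiguous allocation is carved up. The size and layout come from vring_size() and vring_init() in include/uapi/linux/virtio_ring.h (shown here roughly as in the 3.10-era headers): the desc array comes first, the avail ring follows it, and the used ring starts at the next align-byte boundary.

- static inline void vring_init(struct vring *vr, unsigned int num, void *p,
- 			      unsigned long align)
- {
- 	vr->num = num;
- 	vr->desc = p;
- 	vr->avail = p + num * sizeof(struct vring_desc);
- 	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__u16)
- 		+ align - 1) & ~(align - 1));
- }
-
- static inline unsigned vring_size(unsigned int num, unsigned long align)
- {
- 	return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num)
- 		 + align - 1) & ~(align - 1))
- 		+ sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num;
- }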
- struct virtqueue *vring_new_virtqueue(unsigned int index,
- unsigned int num,
- unsigned int vring_align,
- struct virtio_device *vdev,
- bool weak_barriers,
- void *pages,
- void (*notify)(struct virtqueue *),
- void (*callback)(struct virtqueue *),
- const char *name)
- {
- struct vring_virtqueue *vq;
- unsigned int i;
-
- /* We assume num is a power of 2. */
- if (num & (num - 1)) {
- dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
- return NULL;
- }
-
- vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
- if (!vq)
- return NULL;
-
- vring_init(&vq->vring, num, pages, vring_align);
- vq->vq.callback = callback;
- vq->vq.vdev = vdev;
- vq->vq.name = name;
- vq->vq.num_free = num;
- vq->vq.index = index;
- vq->notify = notify;
- vq->weak_barriers = weak_barriers;
- vq->broken = false;
- vq->last_used_idx = 0;
- vq->num_added = 0;
- list_add_tail(&vq->vq.list, &vdev->vqs);
- #ifdef DEBUG
- vq->in_use = false;
- vq->last_add_time_valid = false;
- #endif
-
- vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
- vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
-
- /* No callback? Tell other side not to bother us. */
- if (!callback)
- vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
-
- /* Put everything in free lists. */
- vq->free_head = 0;
- /* Chain the descriptors into the initial free list */
- for (i = 0; i < num-1; i++) {
- vq->vring.desc[i].next = i+1;
- vq->data[i] = NULL;
- }
- /* Initialize the token array: data[] holds the driver's buffer token (e.g. an skb) for each chain head; received packet data ends up in the buffers these tokens describe */
- vq->data[i] = NULL;
-
- return &vq->vq;
- }

When the Guest needs to transmit a packet, start_xmit (drivers/net/virtio_net.c) is called; it eventually reaches virtqueue_add, which stores the skb as the token in vq->data and fills the descriptors with the buffer addresses.
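In 3.10 the driver does not call virtqueue_add directly: start_xmit's helper xmit_skb builds a scatterlist covering the virtio-net header plus the skb data and goes through the virtqueue_add_outbuf wrapper, roughly as below (a sketch of the drivers/virtio/virtio_ring.c wrapper: one array of out scatterlists, no in buffers, the skb as the data token).

- int virtqueue_add_outbuf(struct virtqueue *vq,
- 			 struct scatterlist sg[], unsigned int num,
- 			 void *data,
- 			 gfp_t gfp)
- {
- 	/* one out sg array, no in buffers; data is the skb token */
- 	return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
- }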
- static inline int virtqueue_add(struct virtqueue *_vq,
- struct scatterlist *sgs[],
- struct scatterlist *(*next)
- (struct scatterlist *, unsigned int *),
- unsigned int total_out,
- unsigned int total_in,
- unsigned int out_sgs,
- unsigned int in_sgs,
- void *data,
- gfp_t gfp)
- {
- struct vring_virtqueue *vq = to_vvq(_vq);
- struct scatterlist *sg;
- unsigned int i, n, avail, uninitialized_var(prev), total_sg;
- int head;
-
- START_USE(vq);
-
- BUG_ON(data == NULL);
-
- #ifdef DEBUG
- {
- ktime_t now = ktime_get();
-
- /* No kick or get, with .1 second between? Warn. */
- if (vq->last_add_time_valid)
- WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
- > 100);
- vq->last_add_time = now;
- vq->last_add_time_valid = true;
- }
- #endif
-
- total_sg = total_in + total_out;
-
- /* If the host supports indirect descriptor tables, and we have multiple
- * buffers, then go indirect. FIXME: tune this threshold */
- if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
- head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
- total_in,
- out_sgs, in_sgs, gfp);
- if (likely(head >= 0))
- goto add_head;
- }
-
- BUG_ON(total_sg > vq->vring.num);
- BUG_ON(total_sg == 0);
-
- if (vq->vq.num_free < total_sg) {
- pr_debug("Can't add buf len %i - avail = %i\n",
- total_sg, vq->vq.num_free);
- /* FIXME: for historical reasons, we force a notify here if
- * there are outgoing parts to the buffer. Presumably the
- * host should service the ring ASAP. */
- if (out_sgs)
- vq->notify(&vq->vq);
- END_USE(vq);
- return -ENOSPC;
- }
-
- /* We're about to use some buffers from the free list. */
- vq->vq.num_free -= total_sg;
- /* Grab the index of the first free descriptor */
- head = i = vq->free_head;
- /* Fill in the descriptor entries */
- for (n = 0; n < out_sgs; n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- i = vq->vring.desc[i].next;
- }
- }
- for (; n < (out_sgs + in_sgs); n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- i = vq->vring.desc[i].next;
- }
- }
- /* Last one doesn't continue. */
- vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
-
- /* Update free pointer */
- vq->free_head = i;
-
- add_head:
- /* Set token. */
- vq->data[head] = data;
-
- /* Put entry in available array (but don't update avail->idx until they
- * do sync). */
- avail = (vq->vring.avail->idx & (vq->vring.num-1));
- vq->vring.avail->ring[avail] = head;
- /* Descriptors and available array need to be set before we expose the
- * new available array entries. */
- virtio_wmb(vq->weak_barriers);
- /* Bump avail->idx; the host reads this value when it fetches available descriptors */
- vq->vring.avail->idx++;
- vq->num_added++;
- /* This is very unlikely, but theoretically possible. Kick
- * just in case. */
- if (unlikely(vq->num_added == (1 << 16) - 1))
- virtqueue_kick(_vq);
- pr_debug("Added buffer head %i to %p\n", head, vq);
- END_USE(vq);
- return 0;
- }

After start_xmit has added the out buffers, it calls virtqueue_kick to notify the host.
- void virtqueue_kick(struct virtqueue *vq)
- {
- if (virtqueue_kick_prepare(vq))
- virtqueue_notify(vq);
- }
Inside virtqueue_kick, virtqueue_kick_prepare uses the return value of vring_need_event to decide whether the host actually needs to be notified. Let's look at vring_need_event:
- static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
- {
- /* Note: Xen has similar logic for notification hold-off
- * in include/xen/interface/io/ring.h with req_event and req_prod
- * corresponding to event_idx + 1 and new_idx respectively.
- * Note also that req_event and req_prod in Xen start at 1,
- * event indexes in virtio start at 0. */
- return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
- }
Now look at where event_idx comes from:
#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
So where is this value updated? Going back to the vhost code, you will find that after consuming ring buffers vhost calls vhost_update_avail_event, which does __put_user(vq->avail_idx, vhost_avail_event(vq)), i.e. it writes avail_idx into vhost_avail_event(vq). Here is the definition of vhost_avail_event(vq):
#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
As you can see, vhost_avail_event is the same slot as vring_avail_event. As for avail_idx: when vhost fetches available descriptors (vhost_get_vq_desc, called from handle_tx/handle_rx), it reads it with __get_user(vq->avail_idx, &vq->avail->idx); it is the index of the buffers the Guest has made available so far, and the Guest bumps vq->avail->idx every time it adds a buffer (virtqueue_add). To sum up:
The Guest updates vq->avail->idx when it adds buffers; the host reads that value when it fetches available buffers and writes it back into vhost_avail_event. After adding new buffers, the Guest checks whether the vhost_avail_event value written by the host has caught up with its latest index. If it has, the Guest notifies the host via virtqueue_notify; if not, the host is still behind on earlier buffers, so the notification is deferred to a later kick.
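To make the hold-off behaviour concrete, here is a small stand-alone user-space check of the same arithmetic (the index values are made up for illustration):

- #include <stdio.h>
-
- /* Same arithmetic as the kernel's vring_need_event() shown above. */
- static int vring_need_event(unsigned short event_idx,
- 			    unsigned short new_idx, unsigned short old)
- {
- 	return (unsigned short)(new_idx - event_idx - 1) <
- 	       (unsigned short)(new_idx - old);
- }
-
- int main(void)
- {
- 	/* vhost last saw avail->idx == 10 and wrote 10 into avail_event. */
- 	unsigned short event_idx = 10;
-
- 	/* Guest adds the 11th buffer: old = 10, new = 11 -> kick (prints 1). */
- 	printf("%d\n", vring_need_event(event_idx, 11, 10));
-
- 	/* vhost has not re-read avail->idx yet; Guest adds the 12th buffer:
- 	 * old = 11, new = 12 -> no kick, vhost is still catching up (prints 0). */
- 	printf("%d\n", vring_need_event(event_idx, 12, 11));
-
- 	return 0;
- }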
When the Guest calls virtqueue_notify, the write to the notification register traps into the host (a PIO/MMIO exit), and the eventfd mechanism wakes up the vhost worker thread (the wakeup path will be analyzed separately). When the vhost thread is woken up for transmit, it calls handle_tx (drivers/vhost/net.c), which in turn calls vhost_get_vq_desc to fetch the buffers the Guest has filled in.
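For orientation, a heavily simplified skeleton of that handle_tx loop is shown below (based on drivers/vhost/net.c of that era; zerocopy, batching limits and error handling are omitted, so treat it as a sketch rather than the exact source):

- static void handle_tx(struct vhost_net *net)
- {
- 	struct vhost_virtqueue *vq = &net->vqs[VHOST_NET_VQ_TX].vq;
- 	struct socket *sock = rcu_dereference(vq->private_data); /* tap/macvtap */
- 	struct msghdr msg = { .msg_iov = vq->iov };
- 	unsigned out, in;
- 	int head;
-
- 	mutex_lock(&vq->mutex);
- 	vhost_disable_notify(&net->dev, vq);
-
- 	for (;;) {
- 		/* Pull the next descriptor chain the Guest made available */
- 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
- 					 ARRAY_SIZE(vq->iov),
- 					 &out, &in, NULL, NULL);
- 		if (head < 0)
- 			break;
- 		if (head == vq->num) {
- 			/* Ring empty: re-enable Guest kicks and stop */
- 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- 				vhost_disable_notify(&net->dev, vq);
- 				continue;
- 			}
- 			break;
- 		}
-
- 		/* Hand the Guest buffers to the backend socket */
- 		msg.msg_iovlen = out;
- 		sock->ops->sendmsg(NULL, sock, &msg, iov_length(vq->iov, out));
-
- 		/* Return the chain to the Guest through the used ring */
- 		vhost_add_used_and_signal(&net->dev, vq, head, 0);
- 	}
-
- 	mutex_unlock(&vq->mutex);
- }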
- int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
- {
- struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
- int ret;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- /*
-  * Read the latest avail index published by the Guest.
-  */
- if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return -EFAULT;
- }
-
- if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return -EFAULT;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- /*
-  * last_avail_idx is the next avail-ring index vhost will consume; it is
-  * incremented each time vhost uses a descriptor. If vq->avail_idx equals
-  * last_avail_idx, the Guest has not added any new buffers, i.e. there is
-  * nothing to transmit right now.
-  */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been exposed by guest. */
- smp_rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (unlikely(__get_user(head,
- &vq->avail->ring[last_avail_idx % vq->num]))) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return -EFAULT;
- }
-
- /* If their number is silly, that's an error. */
- if (unlikely(head >= vq->num)) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return -EINVAL;
- }
-
- /* When we start there are none of either input nor output. */
- *out_num = *in_num = 0;
- if (unlikely(log))
- *log_num = 0;
-
- i = head;
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(i >= vq->num)) {
- vq_err(vq, "Desc index is %u > %u, head = %u",
- i, vq->num, head);
- return -EINVAL;
- }
- if (unlikely(++found > vq->num)) {
- vq_err(vq, "Loop detected: last one at %u "
- "vq size %u head %u\n",
- i, vq->num, head);
- return -EINVAL;
- }
- ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
- if (unlikely(ret)) {
- vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
- i, vq->desc + i);
- return -EFAULT;
- }
- if (desc.flags & VRING_DESC_F_INDIRECT) {
- ret = get_indirect(dev, vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
- if (unlikely(ret < 0)) {
- vq_err(vq, "Failure detected "
- "in indirect descriptor at idx %d\n", i);
- return ret;
- }
- continue;
- }
-
- ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
- iov_size - iov_count);
- if (unlikely(ret < 0)) {
- vq_err(vq, "Translation failure %d descriptor idx %d\n",
- ret, i);
- return ret;
- }
- if (desc.flags & VRING_DESC_F_WRITE) {
- /* If this is an input descriptor,
- * increment that count. */
- *in_num += ret;
- if (unlikely(log)) {
- log[*log_num].addr = desc.addr;
- log[*log_num].len = desc.len;
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Descriptor has out after in: "
- "idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(&desc)) != -1);
-
- /* On success, increment avail index. */
- /*
-  * One descriptor chain successfully fetched; advance last_avail_idx.
-  */
- vq->last_avail_idx++;
-
- /* Assume notifications from guest are disabled at this point,
- * if they aren't we would need to update avail_event index. */
- BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
- return head;
- }

After vhost has fetched the available buffers, it sends the packet via sock->ops->sendmsg, then calls vhost_add_used to hand the used buffer information back to the Guest.
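In handle_tx this is typically done through the vhost_add_used_and_signal helper (drivers/vhost/vhost.c), which pairs the used-ring update with a possible Guest notification:

- void vhost_add_used_and_signal(struct vhost_dev *dev,
- 			       struct vhost_virtqueue *vq,
- 			       unsigned int head, int len)
- {
- 	vhost_add_used(vq, head, len);
- 	vhost_signal(dev, vq);
- }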
- int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
- {
- struct vring_used_elem __user *used;
-
- /* The virtqueue contains a ring of used buffers. Get a pointer to the
- * next entry in that used ring. */
- used = &vq->used->ring[vq->last_used_idx % vq->num];
- if (__put_user(head, &used->id)) {
- vq_err(vq, "Failed to write used id");
- return -EFAULT;
- }
- if (__put_user(len, &used->len)) {
- vq_err(vq, "Failed to write used len");
- return -EFAULT;
- }
- /* Make sure buffer is written before we update index. */
- smp_wmb();
- /*
-  * Write last_used_idx + 1 into vq->used->idx.
-  */
- if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) {
- vq_err(vq, "Failed to increment used idx");
- return -EFAULT;
- }
- if (unlikely(vq->log_used)) {
- /* Make sure data is seen before log. */
- smp_wmb();
- /* Log used ring entry write. */
- log_write(vq->log_base,
- vq->log_addr +
- ((void __user *)used - (void __user *)vq->used),
- sizeof *used);
- /* Log used index update. */
- log_write(vq->log_base,
- vq->log_addr + offsetof(struct vring_used, idx),
- sizeof vq->used->idx);
- if (vq->log_ctx)
- eventfd_signal(vq->log_ctx, 1);
- }
- /*
-  * Advance vhost's count of used descriptors.
-  */
- vq->last_used_idx++;
- /* If the driver never bothers to signal in a very long while,
- * used index might wrap around. If that happens, invalidate
- * signalled_used index we stored. TODO: make sure driver
- * signals at least once in 2^16 and remove this. */
- if (unlikely(vq->last_used_idx == vq->signalled_used))
- vq->signalled_used_valid = false;
- return 0;
- }

vhost_add_used uses __put_user(vq->last_used_idx + 1, &vq->used->idx) to publish the current used index into vq->used->idx. This tells the Guest how far vhost has got through the ring, so that when the Guest reclaims buffers or picks up packets forwarded by vhost it knows where to read from.
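On the Guest side, this published index is exactly what more_used() in drivers/virtio/virtio_ring.c checks when deciding whether there is anything left to collect:

- static inline bool more_used(const struct vring_virtqueue *vq)
- {
- 	return vq->last_used_idx != vq->vring.used->idx;
- }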
When vhost receives packets handed up by the underlying device (tap/macvtap), handle_rx is called. It first obtains the available descriptors via get_rx_bufs, then receives the packet data with sock->ops->recvmsg, and finally calls vhost_add_used_and_signal_n to record the used descriptors.
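vhost_add_used_and_signal_n itself is a thin wrapper (drivers/vhost/vhost.c) that batches the used-ring update and then signals the Guest:

- void vhost_add_used_and_signal_n(struct vhost_dev *dev,
- 				 struct vhost_virtqueue *vq,
- 				 struct vring_used_elem *heads, unsigned count)
- {
- 	vhost_add_used_n(vq, heads, count);
- 	vhost_signal(dev, vq);
- }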
- int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
- unsigned count)
- {
- int start, n, r;
-
- start = vq->last_used_idx % vq->num;
- n = vq->num - start;
- if (n < count) {
- r = __vhost_add_used_n(vq, heads, n);
- if (r < 0)
- return r;
- heads += n;
- count -= n;
- }
- r = __vhost_add_used_n(vq, heads, count);
-
- /* Make sure buffer is written before we update index. */
- smp_wmb();
- /*
-  * Publish the current used index to vq->used->idx so the Guest's
-  * receive path can see how far vhost has written.
-  */
- if (put_user(vq->last_used_idx, &vq->used->idx)) {
- vq_err(vq, "Failed to increment used idx");
- return -EFAULT;
- }
- if (unlikely(vq->log_used)) {
- /* Log used index update. */
- log_write(vq->log_base,
- vq->log_addr + offsetof(struct vring_used, idx),
- sizeof vq->used->idx);
- if (vq->log_ctx)
- eventfd_signal(vq->log_ctx, 1);
- }
- return r;
- }

In __vhost_add_used_n, vhost updates last_used_idx (new = (vq->last_used_idx += count)) and then publishes the new value to the Guest with put_user(vq->last_used_idx, &vq->used->idx). Once the indexes are updated, vhost calls vhost_signal to notify the Guest that packets are ready. vhost_signal first decides whether a notification is actually needed, using the same logic the Guest uses when deciding whether to notify the Host: vring_need_event is evaluated again, this time with event_idx being the used index the Guest has reclaimed up to. If the Guest is reclaiming descriptors slowly, it still has plenty of received packets pending, so the notification is skipped for now.
- static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
- {
- /* Note: Xen has similar logic for notification hold-off
- * in include/xen/interface/io/ring.h with req_event and req_prod
- * corresponding to event_idx + 1 and new_idx respectively.
- * Note also that req_event and req_prod in Xen start at 1,
- * event indexes in virtio start at 0. */
- return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
- }
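
The check is wrapped by vhost_notify, and the actual notification travels over the call eventfd; vhost_signal (drivers/vhost/vhost.c) ties the two together:

- void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
- {
- 	/* Signal the Guest: tell them we used something up. */
- 	if (vq->call_ctx && vhost_notify(dev, vq))
- 		eventfd_signal(vq->call_ctx, 1);
- }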
When vhost decides the Guest does need to be notified, it injects a virtual interrupt into the Guest through irqfd (the detailed flow will be analyzed separately). On receiving the interrupt, the Guest runs the handler skb_recv_done, which raises the NET_RX_SOFTIRQ softirq, and packet reception is then handled in virtnet_poll (see init_vqs in drivers/net/virtio_net.c).
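A sketch of that interrupt callback (skb_recv_done in drivers/net/virtio_net.c; minor details may differ across 3.10.x revisions): it simply masks further virtqueue callbacks and schedules NAPI.

- static void skb_recv_done(struct virtqueue *rvq)
- {
- 	struct virtnet_info *vi = rvq->vdev->priv;
- 	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
-
- 	/* Schedule NAPI, suppress further interrupts if successful. */
- 	if (napi_schedule_prep(&rq->napi)) {
- 		virtqueue_disable_cb(rvq);
- 		__napi_schedule(&rq->napi);
- 	}
- }

The NAPI poll routine itself follows: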
- static int virtnet_poll(struct napi_struct *napi, int budget)
- {
- 	struct receive_queue *rq =
- 		container_of(napi, struct receive_queue, napi);
- 	struct virtnet_info *vi = rq->vq->vdev->priv;
- 	void *buf;
- 	unsigned int r, len, received = 0;
-
- again:
- 	/* Fetch the buffers the Host has filled in */
- 	while (received < budget &&
- 	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
- 		receive_buf(rq, buf, len);
- 		--rq->num;
- 		received++;
- 	}
-
- 	if (rq->num < rq->max / 2) {
- 		/* Repost descriptors to refill the rx ring */
- 		if (!try_fill_recv(rq, GFP_ATOMIC))
- 			schedule_delayed_work(&vi->refill, 0);
- 	}
-
- 	/* Out of packets? */
- 	if (received < budget) {
- 		r = virtqueue_enable_cb_prepare(rq->vq);
- 		napi_complete(napi);
- 		if (unlikely(virtqueue_poll(rq->vq, r)) &&
- 		    napi_schedule_prep(napi)) {
- 			virtqueue_disable_cb(rq->vq);
- 			__napi_schedule(napi);
- 			goto again;
- 		}
- 	}
-
- 	return received;
- }

virtqueue_get_buf retrieves the packet buffers filled in by the Host; the packets are then handed to the network stack for processing.
- void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
- {
- 	struct vring_virtqueue *vq = to_vvq(_vq);
- 	void *ret;
- 	unsigned int i;
- 	u16 last_used;
-
- 	START_USE(vq);
-
- 	if (unlikely(vq->broken)) {
- 		END_USE(vq);
- 		return NULL;
- 	}
-
- 	/*
- 	 * Compare vq->last_used_idx with vq->vring.used->idx: last_used_idx is
- 	 * the used entry the Guest has processed up to, while used->idx is
- 	 * written by vhost and marks how far the host has got. If they are
- 	 * equal, there are no received packets left for the Guest to handle.
- 	 */
- 	if (!more_used(vq)) {
- 		pr_debug("No more buffers in queue\n");
- 		END_USE(vq);
- 		return NULL;
- 	}
-
- 	/* Only get used array entries after they have been exposed by host. */
- 	virtio_rmb(vq->weak_barriers);
-
- 	last_used = (vq->last_used_idx & (vq->vring.num - 1));
- 	i = vq->vring.used->ring[last_used].id;
- 	*len = vq->vring.used->ring[last_used].len;
-
- 	if (unlikely(i >= vq->vring.num)) {
- 		BAD_RING(vq, "id %u out of range\n", i);
- 		return NULL;
- 	}
- 	if (unlikely(!vq->data[i])) {
- 		BAD_RING(vq, "id %u is not a head!\n", i);
- 		return NULL;
- 	}
-
- 	/* detach_buf clears data, so grab it now. */
- 	/* vq->data[i] is the driver token for this packet buffer */
- 	ret = vq->data[i];
- 	detach_buf(vq, i);
-
- 	/* Each completed buffer advances last_used_idx by one */
- 	vq->last_used_idx++;
- 	/* If we expect an interrupt for the next entry, tell host
- 	 * by writing event index and flush out the write before
- 	 * the read in the next get_buf call. */
- 	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
- 		vring_used_event(&vq->vring) = vq->last_used_idx;
- 		virtio_mb(vq->weak_barriers);
- 	}
-
- #ifdef DEBUG
- 	vq->last_add_time_valid = false;
- #endif
-
- 	END_USE(vq);
- 	return ret;
- }
- EXPORT_SYMBOL_GPL(virtqueue_get_buf);
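
Note the write to vring_used_event(&vq->vring) near the end: used_event is the mirror image of avail_event. It lives in the slot right after the avail ring, and vhost_notify reads it (feeding it into vring_need_event) to decide whether an interrupt needs to be injected:
#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num])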
