当前位置:   article > 正文

vhost源码分析

vhost源码分析

1、概述

       vhost的大致原理就是qemu在Guest、Host之间创建一些共享buffer,Guest作为生产者往buffer填充可用描述符信息,Host作为消费者从可用描述符里消费buffer。Host消费完buffer后再通知Guest回收描述符;

       本文主要基于3.10版本kernel,分析了vhost的报文收发过程。

2、vring

Guest、Host之间通过共享vring buffer的方式完成数据报文传递,相关数据结构如下,其中vring_virtqueue为Guest侧数据结构,vhost_virtqueue为Host侧数据结构;

  1. struct vring_virtqueue
  2. {
  3. struct virtqueue vq;
  4. /* Actual memory layout for this queue */
  5. /*
  6. 包含desc、avail、used三个vring,其中desc用于存放描述符信息,avail用于表示当前可用的desc 的
  7. head id,used用于描述当前已经使用的desc的head id
  8. */
  9. struct vring vring;
  10. /* Can we use weak barriers? */
  11. bool weak_barriers;
  12. /* Other side has made a mess, don't try any more. */
  13. bool broken;
  14. /* Host supports indirect buffers */
  15. bool indirect;
  16. /* Host publishes avail event idx */
  17. bool event;
  18. /* Head of free buffer list. */
  19. unsigned int free_head;
  20. /* Number we've added since last sync. */
  21. unsigned int num_added;
  22. /* Last used index we've seen. */
  23. /*
  24. 用于描述Guest当前已回收的最后一个desc id值
  25. */
  26. u16 last_used_idx;
  27. /* How to notify other side. FIXME: commonalize hcalls! */
  28. void (*notify)(struct virtqueue *vq);
  29. #ifdef DEBUG
  30. /* They're supposed to lock for us. */
  31. unsigned int in_use;
  32. /* Figure out if their kicks are too delayed. */
  33. bool last_add_time_valid;
  34. ktime_t last_add_time;
  35. #endif
  36. /* Tokens for callbacks. */
  37. void *data[];
  38. };
  1. struct vhost_virtqueue {
  2. struct vhost_dev *dev;
  3. /* The actual ring of buffers. */
  4. struct mutex mutex;
  5. unsigned int num;
  6. /*
  7. Qemu通过VHOST_SET_VRING_ADDR将Guest的三个vring地址通知给vhost,vhost填充到
  8. vhost_virtqueue对应字段
  9. */
  10. struct vring_desc __user *desc;
  11. struct vring_avail __user *avail;
  12. struct vring_used __user *used;
  13. struct file *kick;
  14. struct file *call;
  15. struct file *error;
  16. struct eventfd_ctx *call_ctx;
  17. struct eventfd_ctx *error_ctx;
  18. struct eventfd_ctx *log_ctx;
  19. struct vhost_poll poll;
  20. /* The routine to call when the Guest pings us, or timeout. */
  21. vhost_work_fn_t handle_kick;
  22. /* Last available index we saw. */
  23. /*
  24. Host可用的第一个desc id
  25. */
  26. u16 last_avail_idx;
  27. /* Caches available index value from user. */
  28. u16 avail_idx;
  29. /* Last index we used. */
  30. u16 last_used_idx;
  31. /* Used flags */
  32. u16 used_flags;
  33. /* Last used index value we have signalled on */
  34. u16 signalled_used;
  35. /* Last used index value we have signalled on */
  36. bool signalled_used_valid;
  37. /* Log writes to used structure. */
  38. bool log_used;
  39. u64 log_addr;
  40. struct iovec iov[UIO_MAXIOV];
  41. struct iovec *indirect;
  42. struct vring_used_elem *heads;
  43. /* We use a kind of RCU to access private pointer.
  44. * All readers access it from worker, which makes it possible to
  45. * flush the vhost_work instead of synchronize_rcu. Therefore readers do
  46. * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
  47. * vhost_work execution acts instead of rcu_read_lock() and the end of
  48. * vhost_work execution acts instead of rcu_read_unlock().
  49. * Writers use virtqueue mutex. */
  50. void __rcu *private_data;
  51. /* Log write descriptors */
  52. void __user *log_base;
  53. struct vhost_log *log;
  54. };

3、发包流程

3.1 Guest侧

  1. static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
  2. void (*callback)(struct virtqueue *vq),
  3. const char *name,
  4. u16 msix_vec)
  5. {
  6. struct virtio_pci_device *vp_dev = to_vp_device(vdev);
  7. struct virtio_pci_vq_info *info;
  8. struct virtqueue *vq;
  9. unsigned long flags, size;
  10. u16 num;
  11. int err;
  12. /* Select the queue we're interested in */
  13. iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
  14. /* Check if queue is either not available or already active. */
  15. /*获取当前配置的vring buffer个数*/
  16. num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
  17. if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
  18. return ERR_PTR(-ENOENT);
  19. /* allocate and fill out our structure the represents an active
  20. * queue */
  21. info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL);
  22. if (!info)
  23. return ERR_PTR(-ENOMEM);
  24. info->num = num;
  25. info->msix_vector = msix_vec;
  26. size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
  27. /*分配desc页信息*/
  28. info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
  29. if (info->queue == NULL) {
  30. err = -ENOMEM;
  31. goto out_info;
  32. }
  33. /* activate the queue */
  34. iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
  35. vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
  36. /* create the vring */
  37. vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
  38. true, info->queue, vp_notify, callback, name);
  39. if (!vq) {
  40. err = -ENOMEM;
  41. goto out_activate_queue;
  42. }
  43. vq->priv = info;
  44. info->vq = vq;
  45. if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
  46. iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
  47. msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
  48. if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
  49. err = -EBUSY;
  50. goto out_assign;
  51. }
  52. }
  53. if (callback) {
  54. spin_lock_irqsave(&vp_dev->lock, flags);
  55. list_add(&info->node, &vp_dev->virtqueues);
  56. spin_unlock_irqrestore(&vp_dev->lock, flags);
  57. } else {
  58. INIT_LIST_HEAD(&info->node);
  59. }
  60. return vq;
  61. out_assign:
  62. vring_del_virtqueue(vq);
  63. out_activate_queue:
  64. iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
  65. free_pages_exact(info->queue, size);
  66. out_info:
  67. kfree(info);
  68. return ERR_PTR(err);
  69. }

Guest在初始化virtio设备时,在setup_vq函数里,先获取配置的vring buffer个数信息,然后分配实际的desc内存,调用vring_new_virtqueue创建vq,完成vring信息的初始化;

  1. struct virtqueue *vring_new_virtqueue(unsigned int index,
  2. unsigned int num,
  3. unsigned int vring_align,
  4. struct virtio_device *vdev,
  5. bool weak_barriers,
  6. void *pages,
  7. void (*notify)(struct virtqueue *),
  8. void (*callback)(struct virtqueue *),
  9. const char *name)
  10. {
  11. struct vring_virtqueue *vq;
  12. unsigned int i;
  13. /* We assume num is a power of 2. */
  14. if (num & (num - 1)) {
  15. dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
  16. return NULL;
  17. }
  18. vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
  19. if (!vq)
  20. return NULL;
  21. vring_init(&vq->vring, num, pages, vring_align);
  22. vq->vq.callback = callback;
  23. vq->vq.vdev = vdev;
  24. vq->vq.name = name;
  25. vq->vq.num_free = num;
  26. vq->vq.index = index;
  27. vq->notify = notify;
  28. vq->weak_barriers = weak_barriers;
  29. vq->broken = false;
  30. vq->last_used_idx = 0;
  31. vq->num_added = 0;
  32. list_add_tail(&vq->vq.list, &vdev->vqs);
  33. #ifdef DEBUG
  34. vq->in_use = false;
  35. vq->last_add_time_valid = false;
  36. #endif
  37. vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
  38. vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
  39. /* No callback? Tell other side not to bother us. */
  40. if (!callback)
  41. vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
  42. /* Put everything in free lists. */
  43. vq->free_head = 0;
  44. /*初始化desc描述符vring*/
  45. for (i = 0; i < num-1; i++) {
  46. vq->vring.desc[i].next = i+1;
  47. vq->data[i] = NULL;
  48. }
  49. /*初始化data数组;data[]保存的是Guest在virtqueue_add时传入的token(vq->data[head] = data),virtqueue_get_buf时再按head id取回*/
  50. vq->data[i] = NULL;
  51. return &vq->vq;
  52. }

当Guest需要向外发送报文时,会调用到start_xmit(virtio_net.c),该函数最终会调用virtqueue_add,将skb(sk_buff)对应的buffer地址和长度填充到desc描述符里,并把skb指针作为token保存到vq->data里;

  1. static inline int virtqueue_add(struct virtqueue *_vq,
  2. struct scatterlist *sgs[],
  3. struct scatterlist *(*next)
  4. (struct scatterlist *, unsigned int *),
  5. unsigned int total_out,
  6. unsigned int total_in,
  7. unsigned int out_sgs,
  8. unsigned int in_sgs,
  9. void *data,
  10. gfp_t gfp)
  11. {
  12. struct vring_virtqueue *vq = to_vvq(_vq);
  13. struct scatterlist *sg;
  14. unsigned int i, n, avail, uninitialized_var(prev), total_sg;
  15. int head;
  16. START_USE(vq);
  17. BUG_ON(data == NULL);
  18. #ifdef DEBUG
  19. {
  20. ktime_t now = ktime_get();
  21. /* No kick or get, with .1 second between? Warn. */
  22. if (vq->last_add_time_valid)
  23. WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
  24. > 100);
  25. vq->last_add_time = now;
  26. vq->last_add_time_valid = true;
  27. }
  28. #endif
  29. total_sg = total_in + total_out;
  30. /* If the host supports indirect descriptor tables, and we have multiple
  31. * buffers, then go indirect. FIXME: tune this threshold */
  32. if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
  33. head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
  34. total_in,
  35. out_sgs, in_sgs, gfp);
  36. if (likely(head >= 0))
  37. goto add_head;
  38. }
  39. BUG_ON(total_sg > vq->vring.num);
  40. BUG_ON(total_sg == 0);
  41. if (vq->vq.num_free < total_sg) {
  42. pr_debug("Can't add buf len %i - avail = %i\n",
  43. total_sg, vq->vq.num_free);
  44. /* FIXME: for historical reasons, we force a notify here if
  45. * there are outgoing parts to the buffer. Presumably the
  46. * host should service the ring ASAP. */
  47. if (out_sgs)
  48. vq->notify(&vq->vq);
  49. END_USE(vq);
  50. return -ENOSPC;
  51. }
  52. /* We're about to use some buffers from the free list. */
  53. vq->vq.num_free -= total_sg;
  54. /*获取当前首个可用描述符id*/
  55. head = i = vq->free_head;
  56. /*填充描述符信息*/
  57. for (n = 0; n < out_sgs; n++) {
  58. for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
  59. vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
  60. vq->vring.desc[i].addr = sg_phys(sg);
  61. vq->vring.desc[i].len = sg->length;
  62. prev = i;
  63. i = vq->vring.desc[i].next;
  64. }
  65. }
  66. for (; n < (out_sgs + in_sgs); n++) {
  67. for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
  68. vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
  69. vq->vring.desc[i].addr = sg_phys(sg);
  70. vq->vring.desc[i].len = sg->length;
  71. prev = i;
  72. i = vq->vring.desc[i].next;
  73. }
  74. }
  75. /* Last one doesn't continue. */
  76. vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
  77. /* Update free pointer */
  78. vq->free_head = i;
  79. add_head:
  80. /* Set token. */
  81. vq->data[head] = data;
  82. /* Put entry in available array (but don't update avail->idx until they
  83. * do sync). */
  84. avail = (vq->vring.avail->idx & (vq->vring.num-1));
  85. vq->vring.avail->ring[avail] = head;
  86. /* Descriptors and available array need to be set before we expose the
  87. * new available array entries. */
  88. virtio_wmb(vq->weak_barriers);
  89. /*更新avail idx,Host在收包get可用描述符时,会获取该值*/
  90. vq->vring.avail->idx++;
  91. vq->num_added++;
  92. /* This is very unlikely, but theoretically possible. Kick
  93. * just in case. */
  94. if (unlikely(vq->num_added == (1 << 16) - 1))
  95. virtqueue_kick(_vq);
  96. pr_debug("Added buffer head %i to %p\n", head, vq);
  97. END_USE(vq);
  98. return 0;
  99. }

start_xmit添加完outbuffer后,调用virtqueue_kick通知host;

  1. void virtqueue_kick(struct virtqueue *vq)
  2. {
  3. if (virtqueue_kick_prepare(vq))
  4. virtqueue_notify(vq);
  5. }

在virtqueue_kick,virtqueue_kick_prepare会根据vring_need_event的返回值判断是否需要通知host,下面重点看一下vring_need_event:

  1. static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
  2. {
  3. /* Note: Xen has similar logic for notification hold-off
  4. * in include/xen/interface/io/ring.h with req_event and req_prod
  5. * corresponding to event_idx + 1 and new_idx respectively.
  6. * Note also that req_event and req_prod in Xen start at 1,
  7. * event indexes in virtio start at 0. */
  8. return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
  9. }

看一下这个event_idx的定义:

#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])

那这个值是在哪里更新的呢?回到vhost代码;会发现vhost在使用完ring buffer后会调用vhost_update_avail_event,在这里会调用__put_user(vq->avail_idx, vhost_avail_event(vq)),会将avail_idx写到vhost_avail_event(vq)里,看下vhost_avail_event(vq)的定义:

#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])

可以发现这个vhost_avail_event其实就是vring_avail_event;至于avail_idx,vhost在handle_tx/handle_rx阶段获取可用描述符(vhost_get_vq_desc)时会通过__get_user(vq->avail_idx, &vq->avail->idx)获取,表示当前Guest已添加的可用buffer id,Guest每添加一个buffer(virtqueue_add),就会更新一次vq->avail->idx,总结一下就是:

Guest在添加buffer的时候会更新vq->avail->idx,host在获取可用buffer时会获取这个值,然后将其写到vhost_avail_event里,Guest在添加完新的buffer后,就会通过vring_need_event判断host写入的vhost_avail_event是否落在本次新增的索引区间[old, new_idx)内,如果是,则通过virtqueue_notify通知host,如果不是,表明host当前消息处理不过来,等下次再通知。

3.2 Host侧

Guest调用virtqueue_notify后,触发mmio异常陷出到host,然后通过eventfd机制唤醒vhost线程(vhost线程唤醒机制后续再单独做分析);当vhost需要发包被唤醒时,会调用handle_tx(drivers/vhost/net.c),在handle_tx里会调用vhost_get_vq_desc获取Guest填充的buffer信息;

  1. int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
  2. struct iovec iov[], unsigned int iov_size,
  3. unsigned int *out_num, unsigned int *in_num,
  4. struct vhost_log *log, unsigned int *log_num)
  5. {
  6. struct vring_desc desc;
  7. unsigned int i, head, found = 0;
  8. u16 last_avail_idx;
  9. int ret;
  10. /* Check it isn't doing very strange things with descriptor numbers. */
  11. last_avail_idx = vq->last_avail_idx;
  12. /*
  13. 获取Guest填充的最新的可用id值
  14. */
  15. if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
  16. vq_err(vq, "Failed to access avail idx at %p\n",
  17. &vq->avail->idx);
  18. return -EFAULT;
  19. }
  20. if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
  21. vq_err(vq, "Guest moved used index from %u to %u",
  22. last_avail_idx, vq->avail_idx);
  23. return -EFAULT;
  24. }
  25. /* If there's nothing new since last we looked, return invalid. */
  26. /*
  27. Last_avail_idx表示vhost当前可用的id值,vhost每使用一个描述符,last_avail_idx就会加1;如果
  28. vq->avail_idx与last_avail_idx相等,则表明Guest没有填充新的buffer,也即当前没有需要发送的
  29. 数据
  30. */
  31. if (vq->avail_idx == last_avail_idx)
  32. return vq->num;
  33. /* Only get avail ring entries after they have been exposed by guest. */
  34. smp_rmb();
  35. /* Grab the next descriptor number they're advertising, and increment
  36. * the index we've seen. */
  37. if (unlikely(__get_user(head,
  38. &vq->avail->ring[last_avail_idx % vq->num]))) {
  39. vq_err(vq, "Failed to read head: idx %d address %p\n",
  40. last_avail_idx,
  41. &vq->avail->ring[last_avail_idx % vq->num]);
  42. return -EFAULT;
  43. }
  44. /* If their number is silly, that's an error. */
  45. if (unlikely(head >= vq->num)) {
  46. vq_err(vq, "Guest says index %u > %u is available",
  47. head, vq->num);
  48. return -EINVAL;
  49. }
  50. /* When we start there are none of either input nor output. */
  51. *out_num = *in_num = 0;
  52. if (unlikely(log))
  53. *log_num = 0;
  54. i = head;
  55. do {
  56. unsigned iov_count = *in_num + *out_num;
  57. if (unlikely(i >= vq->num)) {
  58. vq_err(vq, "Desc index is %u > %u, head = %u",
  59. i, vq->num, head);
  60. return -EINVAL;
  61. }
  62. if (unlikely(++found > vq->num)) {
  63. vq_err(vq, "Loop detected: last one at %u "
  64. "vq size %u head %u\n",
  65. i, vq->num, head);
  66. return -EINVAL;
  67. }
  68. ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
  69. if (unlikely(ret)) {
  70. vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
  71. i, vq->desc + i);
  72. return -EFAULT;
  73. }
  74. if (desc.flags & VRING_DESC_F_INDIRECT) {
  75. ret = get_indirect(dev, vq, iov, iov_size,
  76. out_num, in_num,
  77. log, log_num, &desc);
  78. if (unlikely(ret < 0)) {
  79. vq_err(vq, "Failure detected "
  80. "in indirect descriptor at idx %d\n", i);
  81. return ret;
  82. }
  83. continue;
  84. }
  85. ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
  86. iov_size - iov_count);
  87. if (unlikely(ret < 0)) {
  88. vq_err(vq, "Translation failure %d descriptor idx %d\n",
  89. ret, i);
  90. return ret;
  91. }
  92. if (desc.flags & VRING_DESC_F_WRITE) {
  93. /* If this is an input descriptor,
  94. * increment that count. */
  95. *in_num += ret;
  96. if (unlikely(log)) {
  97. log[*log_num].addr = desc.addr;
  98. log[*log_num].len = desc.len;
  99. ++*log_num;
  100. }
  101. } else {
  102. /* If it's an output descriptor, they're all supposed
  103. * to come before any input descriptors. */
  104. if (unlikely(*in_num)) {
  105. vq_err(vq, "Descriptor has out after in: "
  106. "idx %d\n", i);
  107. return -EINVAL;
  108. }
  109. *out_num += ret;
  110. }
  111. } while ((i = next_desc(&desc)) != -1);
  112. /* On success, increment avail index. */
  113. /*
  114. 成功获取一个buffer后,last_avail_idx加1
  115. */
  116. vq->last_avail_idx++;
  117. /* Assume notifications from guest are disabled at this point,
  118. * if they aren't we would need to update avail_event index. */
  119. BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
  120. return head;
  121. }

Vhost获取可用buffer后,通过sock->ops->sendmsg完成报文的发送;然后调用vhost_add_used将已使用的buffer信息回填给Guest;

  1. int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
  2. {
  3. struct vring_used_elem __user *used;
  4. /* The virtqueue contains a ring of used buffers. Get a pointer to the
  5. * next entry in that used ring. */
  6. used = &vq->used->ring[vq->last_used_idx % vq->num];
  7. if (__put_user(head, &used->id)) {
  8. vq_err(vq, "Failed to write used id");
  9. return -EFAULT;
  10. }
  11. if (__put_user(len, &used->len)) {
  12. vq_err(vq, "Failed to write used len");
  13. return -EFAULT;
  14. }
  15. /* Make sure buffer is written before we update index. */
  16. smp_wmb();
  17. /*
  18. last_used_idx+1写到vq->used->idx里
  19. */
  20. if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) {
  21. vq_err(vq, "Failed to increment used idx");
  22. return -EFAULT;
  23. }
  24. if (unlikely(vq->log_used)) {
  25. /* Make sure data is seen before log. */
  26. smp_wmb();
  27. /* Log used ring entry write. */
  28. log_write(vq->log_base,
  29. vq->log_addr +
  30. ((void __user *)used - (void __user *)vq->used),
  31. sizeof *used);
  32. /* Log used index update. */
  33. log_write(vq->log_base,
  34. vq->log_addr + offsetof(struct vring_used, idx),
  35. sizeof vq->used->idx);
  36. if (vq->log_ctx)
  37. eventfd_signal(vq->log_ctx, 1);
  38. }
  39. /*
  40. Vhost已使用的id加1
  41. */
  42. vq->last_used_idx++;
  43. /* If the driver never bothers to signal in a very long while,
  44. * used index might wrap around. If that happens, invalidate
  45. * signalled_used index we stored. TODO: make sure driver
  46. * signals at least once in 2^16 and remove this. */
  47. if (unlikely(vq->last_used_idx == vq->signalled_used))
  48. vq->signalled_used_valid = false;
  49. return 0;
  50. }

 在vhost_add_used里会调用__put_user(vq->last_used_idx + 1, &vq->used->idx)将vhost当前已使用的used id写到vq->used->idx里,这里的作用是让Guest知道当前vhost已经使用的id值,这样当Guest需要回收buffer或者接收vhost转给它的报文时才知道需要从哪里获取。

4、收包流程

4.1 Host侧

Vhost在接收网卡上送的报文时,会调用handle_rx,在handle_rx里首先通过get_rx_bufs获取当前可用描述符信息,然后通过sock->ops->recvmsg完成报文接收;报文接收完成后调用vhost_add_used_and_signal_n添加已使用id信息;

  1. int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
  2. unsigned count)
  3. {
  4. int start, n, r;
  5. start = vq->last_used_idx % vq->num;
  6. n = vq->num - start;
  7. if (n < count) {
  8. r = __vhost_add_used_n(vq, heads, n);
  9. if (r < 0)
  10. return r;
  11. heads += n;
  12. count -= n;
  13. }
  14. r = __vhost_add_used_n(vq, heads, count);
  15. /* Make sure buffer is written before we update index. */
  16. smp_wmb();
  17. /*
  18. 将当前已使用的id值写到vq->used->idx里,供Guest接收报文时使用
  19. */
  20. if (put_user(vq->last_used_idx, &vq->used->idx)) {
  21. vq_err(vq, "Failed to increment used idx");
  22. return -EFAULT;
  23. }
  24. if (unlikely(vq->log_used)) {
  25. /* Log used index update. */
  26. log_write(vq->log_base,
  27. vq->log_addr + offsetof(struct vring_used, idx),
  28. sizeof vq->used->idx);
  29. if (vq->log_ctx)
  30. eventfd_signal(vq->log_ctx, 1);
  31. }
  32. return r;
  33. }

在__vhost_add_used_n里vhost会更新last_used_idx(new = (vq->last_used_idx += count)),更新完成后,将最新的last_used_idx通过put_user(vq->last_used_idx, &vq->used->idx)通知给Guest;vhost更新完id信息后,调用vhost_signal通知Guest接收报文,在vhost_signal里,同样会先判断是否需要通知,判断的原理跟Guest决定是否通知Host时的类似,这里通过vring_need_event判断是否需要通知Guest,其中event_idx为Guest当前已回收的描述符id信息,如果Guest描述符回收过慢,说明当前Guest还有很多接收报文待处理,暂不通知Guest。

  1. static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
  2. {
  3. /* Note: Xen has similar logic for notification hold-off
  4. * in include/xen/interface/io/ring.h with req_event and req_prod
  5. * corresponding to event_idx + 1 and new_idx respectively.
  6. * Note also that req_event and req_prod in Xen start at 1,
  7. * event indexes in virtio start at 0. */
  8. return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
  9. }

4.2 Guest侧

当vhost判断当前需要通知Guest时,会通过irqfd往Guest注入一个虚拟中断(详细流程到时会单独分析);Guest接收到中断时,进入中断处理函数skb_recv_done,然后触发NET_RX_SOFTIRQ软中断,然后进入virtnet_poll处理报文接收(详细看init_vqs drivers/net/virtio_net.c);

  1. static int virtnet_poll(struct napi_struct *napi, int budget)
  2. {
  3.        struct receive_queue *rq =
  4.               container_of(napi, struct receive_queue, napi);
  5.        struct virtnet_info *vi = rq->vq->vdev->priv;
  6.        void *buf;
  7.        unsigned int r, len, received = 0;
  8. again:
  9.        while (received < budget &&
  10.            /* 获取Host填充的报文信息*/
  11.               (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
  12.               receive_buf(rq, buf, len);
  13.               --rq->num;
  14.               received++;
  15.        }
  16.        if (rq->num < rq->max / 2) {
  17.          /*回收描述符*/
  18.               if (!try_fill_recv(rq, GFP_ATOMIC))
  19.                      schedule_delayed_work(&vi->refill, 0);
  20.        }
  21.        /* Out of packets? */
  22.        if (received < budget) {
  23.               r = virtqueue_enable_cb_prepare(rq->vq);
  24.               napi_complete(napi);
  25.               if (unlikely(virtqueue_poll(rq->vq, r)) &&
  26.                   napi_schedule_prep(napi)) {
  27.                      virtqueue_disable_cb(rq->vq);
  28.                      __napi_schedule(napi);
  29.                      goto again;
  30.               }
  31.        }
  32.        return received;
  33. }

   在virtqueue_get_buf里会获取Host填充的报文数据信息,然后再将报文发给协议栈处理;

  1. void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
  2. {
  3.        struct vring_virtqueue *vq = to_vvq(_vq);
  4.        void *ret;
  5.        unsigned int i;
  6.        u16 last_used;
  7.        START_USE(vq);
  8.        if (unlikely(vq->broken)) {
  9.               END_USE(vq);
  10.               return NULL;
  11.        }
  12. /*
  13.        这里判断vq->last_used_idx与vq->vring.used->idx是否相等,其中vq->last_used_idx表示
  14. Guest上一次已处理的已使用的描述符id值,vq->vring.used->idx是vhost填写的,表示vhost当前
  15. 已使用的描述符id值,如果这两个值相等,说明Guest没有需要处理的报文了
  16. */
  17.        if (!more_used(vq)) {
  18.               pr_debug("No more buffers in queue\n");
  19.               END_USE(vq);
  20.               return NULL;
  21.        }
  22.        /* Only get used array entries after they have been exposed by host. */
  23.        virtio_rmb(vq->weak_barriers);
  24.        last_used = (vq->last_used_idx & (vq->vring.num - 1));
  25.        i = vq->vring.used->ring[last_used].id;
  26.        *len = vq->vring.used->ring[last_used].len;
  27.        if (unlikely(i >= vq->vring.num)) {
  28.               BAD_RING(vq, "id %u out of range\n", i);
  29.               return NULL;
  30.        }
  31.        if (unlikely(!vq->data[i])) {
  32.               BAD_RING(vq, "id %u is not a head!\n", i);
  33.               return NULL;
  34.        }
  35.        /* detach_buf clears data, so grab it now. */
  36. /*Vq->data[i]为具体的报文buffer*/   
  37.        ret = vq->data[i];
  38.        detach_buf(vq, i);
  39. /*每处理完一个报文,last_used_idx加1*/
  40.        vq->last_used_idx++;
  41.        /* If we expect an interrupt for the next entry, tell host
  42.         * by writing event index and flush out the write before
  43.         * the read in the next get_buf call. */
  44.        if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
  45.               vring_used_event(&vq->vring) = vq->last_used_idx;
  46.               virtio_mb(vq->weak_barriers);
  47.        }
  48. #ifdef DEBUG
  49.        vq->last_add_time_valid = false;
  50. #endif
  51.        END_USE(vq);
  52.        return ret;
  53. }
  54. EXPORT_SYMBOL_GPL(virtqueue_get_buf);

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/寸_铁/article/detail/999072
推荐阅读
相关标签
  

闽ICP备14008679号