A vhost-user NIC is described by three pieces on the QEMU command line: a socket chardev carrying the control plane, a vhost-user netdev backend, and the virtio-net-pci front end:

-chardev socket,id=charnet1,path=/run/openvswitch/vhu1,server
-netdev vhost-user,chardev=charnet1,queues=2,id=hostnet1
-device virtio-net-pci,mrg_rxbuf=on,mq=on,vectors=6,netdev=hostnet1,id=net1,mac=52:54:00:3f:8f:56,bus=pci.0,addr=0x4
Start with the virtio-net-pci front end: the -device line above creates it and ties it to the backend netdev whose id is hostnet1.
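For context, the three fragments typically appear in one invocation roughly as below. The machine and memory options are illustrative additions, not from the original, though note that vhost-user does require guest RAM to come from a shareable memory backend, since the slave process maps it directly:

# minimal sketch; machine/memory options are illustrative
qemu-system-x86_64 \
    -machine pc,accel=kvm \
    -m 4G \
    -object memory-backend-file,id=mem0,size=4G,mem-path=/dev/hugepages,share=on \
    -numa node,memdev=mem0 \
    -chardev socket,id=charnet1,path=/run/openvswitch/vhu1,server \
    -netdev vhost-user,chardev=charnet1,queues=2,id=hostnet1 \
    -device virtio-net-pci,mrg_rxbuf=on,mq=on,vectors=6,netdev=hostnet1,id=net1,mac=52:54:00:3f:8f:56,bus=pci.0,addr=0x4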
VirtIONetPCI is a composite data structure: it describes both the NIC's PCI presence inside the guest and the NIC's device state on the QEMU side.

struct VirtIONetPCI {
    VirtIOPCIProxy parent_obj; /* PCI-related state of the NIC */
    VirtIONet vdev;            /* NIC device state on the host side */
};
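Because both members are embedded by value, QEMU can move between the two views of the device with pointer arithmetic; QOM casts such as DO_UPCAST (used later in this article) reduce to exactly this. A standalone sketch with stand-in types:

#include <stdio.h>
#include <stddef.h>

/* Stand-ins for the real QEMU types. */
typedef struct VirtIOPCIProxy { int pci_state; } VirtIOPCIProxy;
typedef struct VirtIONet     { int net_state; } VirtIONet;

typedef struct VirtIONetPCI {
    VirtIOPCIProxy parent_obj;  /* PCI view of the NIC */
    VirtIONet vdev;             /* virtio-net view of the NIC */
} VirtIONetPCI;

/* Same trick as QEMU's container_of()/DO_UPCAST(): member ptr -> container ptr */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    VirtIONetPCI dev = { { 1 }, { 2 } };
    VirtIONet *vdev = &dev.vdev;

    /* Recover the enclosing VirtIONetPCI from the embedded VirtIONet. */
    VirtIONetPCI *d = container_of(vdev, VirtIONetPCI, vdev);
    printf("pci_state=%d net_state=%d\n", d->parent_obj.pci_state, vdev->net_state);
    return 0;
}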
VirtIONet is initialized in virtio_net_device_realize. It holds the NIC's virtio data queues and control queue, the NIC state, the backend type, the MAC address, whether the backend supports TSO/UFO offload, the user-configured NIC properties, and so on.

struct VirtIONet {
    VirtIODevice parent_obj;
    uint8_t mac[ETH_ALEN];
    uint16_t status;
    VirtIONetQueue *vqs;        /* data queues */
    VirtQueue *ctrl_vq;         /* control queue */
    NICState *nic;              /* NIC state */
    ......
    uint32_t has_vnet_hdr;      /* whether a tap-backed NIC has a vnet header;
                                 * used to probe the offloads the NIC supports */
    size_t host_hdr_len;
    size_t guest_hdr_len;
    uint64_t host_features;     /* user-configured properties: mrg_rxbuf, mq, tso, ufo */
    ......
    uint8_t has_ufo;            /* whether the backend supports UDP offload (UFO) */
    uint32_t mergeable_rx_bufs; /* whether receive-side buffer merging is enabled */
    ......
    uint8_t vhost_started;      /* for vhost-backed NICs, whether vhost is started */
    ......
};
typedef struct NICState {
    NetClientState *ncs;    /* one NetClientState per queue */
    NICConf *conf;          /* NIC configuration (MAC address, peers, ...) */
    void *opaque;           /* back-pointer to the owning device state */
    bool peer_deleted;
} NICState;
NetClientState is the structure that links the device layer with the netdev layer. As the name suggests, the device acts as a client of the netdev backend; device and netdev can equally be viewed as a peer-to-peer pair:

struct NetClientState {
    NetClientInfo *info;                /* type and ops of this endpoint */
    int link_down;                      /* link state */
    QTAILQ_ENTRY(NetClientState) next;
    NetClientState *peer;               /* the endpoint on the other side */
    ......
}
typedef struct NetClientInfo {
    NetClientDriver type;   /* which driver backs this endpoint, see the enum below */
    ......
}
The device side always registers a NetClientState of type NET_CLIENT_DRIVER_NIC. The peer's type field varies with the concrete backend behind the device, be it the vhost-user backend covered in this article, a tap device, or a vdpa device. For a vhost-user NIC the device end is NET_CLIENT_DRIVER_NIC and the peer end is NET_CLIENT_DRIVER_VHOST_USER; for a vhost-net NIC the device end is again NET_CLIENT_DRIVER_NIC and the peer end is NET_CLIENT_DRIVER_TAP.
typedef enum NetClientDriver {
    NET_CLIENT_DRIVER_NONE,
    NET_CLIENT_DRIVER_NIC,
    NET_CLIENT_DRIVER_USER,
    NET_CLIENT_DRIVER_TAP,
    NET_CLIENT_DRIVER_L2TPV3,
    NET_CLIENT_DRIVER_SOCKET,
    NET_CLIENT_DRIVER_VDE,
    NET_CLIENT_DRIVER_BRIDGE,
    NET_CLIENT_DRIVER_HUBPORT,
    NET_CLIENT_DRIVER_NETMAP,
    NET_CLIENT_DRIVER_VHOST_USER,
    NET_CLIENT_DRIVER_VHOST_VDPA,
#if defined(CONFIG_VMNET)
    NET_CLIENT_DRIVER_VMNET_HOST,
#endif /* defined(CONFIG_VMNET) */
#if defined(CONFIG_VMNET)
    NET_CLIENT_DRIVER_VMNET_SHARED,
#endif /* defined(CONFIG_VMNET) */
#if defined(CONFIG_VMNET)
    NET_CLIENT_DRIVER_VMNET_BRIDGED,
#endif /* defined(CONFIG_VMNET) */
    NET_CLIENT_DRIVER__MAX,
} NetClientDriver;
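To make the pairing concrete, here is a minimal standalone sketch (structures trimmed to the essentials, helper name hypothetical) of a NET_CLIENT_DRIVER_NIC endpoint peered with a NET_CLIENT_DRIVER_VHOST_USER endpoint:

#include <stdio.h>

typedef enum NetClientDriver {
    NET_CLIENT_DRIVER_NIC,
    NET_CLIENT_DRIVER_TAP,
    NET_CLIENT_DRIVER_VHOST_USER,
} NetClientDriver;

typedef struct NetClientState NetClientState;
struct NetClientState {
    NetClientDriver type;
    NetClientState *peer;   /* the endpoint on the other side */
    const char *name;
};

/* Wire a device-side client to its backend-side client (hypothetical helper). */
static void pair(NetClientState *dev, NetClientState *backend)
{
    dev->peer = backend;
    backend->peer = dev;
}

int main(void)
{
    NetClientState nic = { NET_CLIENT_DRIVER_NIC,        NULL, "net1" };
    NetClientState vhu = { NET_CLIENT_DRIVER_VHOST_USER, NULL, "hostnet1" };

    pair(&nic, &vhu);
    printf("%s <-> %s (peer type %d)\n", nic.name, nic.peer->name, nic.peer->type);
    return 0;
}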
Now to the backend: -netdev vhost-user,chardev=charnet1,queues=2,id=hostnet1 creates a vhost-user netdev, whose state lives in NetVhostUserState:
typedef struct NetVhostUserState {
    NetClientState nc;          /* base class: the netdev-side endpoint */
    CharBackend chr;            /* chardev carrying the vhost-user socket */
    VHostNetState *vhost_net;   /* the vhost device state, see below */
    ......
    uint64_t acked_features;    /* copy of vhost_dev.acked_features, kept across reconnects */
    bool started;               /* whether the backend connection is up */
} NetVhostUserState;
typedef struct vhost_net VHostNetState;
struct vhost_net {
struct vhost_dev dev;
......
};
struct vhost_dev {
    VirtIODevice *vdev;
    ......
    uint64_t features;          /* features the slave supports, via VHOST_USER_GET_FEATURES */
    uint64_t acked_features;    /* features acked by the guest */
    uint64_t backend_features;  /* backend-only bits, e.g. VHOST_USER_F_PROTOCOL_FEATURES */
    uint64_t protocol_features; /* vhost-user protocol features, see below */
    ......
};
The control-plane channel itself comes from -chardev socket,id=charnet1,path=/run/openvswitch/vhu1,server, which creates a socket chardev. The generic chardev state:
struct Chardev {
    Object parent_obj;      /* chardev base class; ChardevClass and Chardev are linked
                             * through the ObjectClass of their common parent */
    ......
    CharBackend *be;
    ......
    int be_open;            /* whether the chardev is open */
    ......
    GSource *gsource;
    GMainContext *gcontext;
};
char_socket_class_init registers the socket chardev's operations:

typedef struct ChardevClass {
    ObjectClass parent_class;
    ......
    void (*open)(Chardev *chr, ChardevBackend *backend,        /* qmp_chardev_open_socket */
                 bool *be_opened, Error **errp);
    int (*chr_write)(Chardev *s, const uint8_t *buf, int len); /* tcp_chr_write */
    ......
    int (*chr_wait_connected)(Chardev *chr, Error **errp);     /* tcp_chr_wait_connected */
    void (*chr_disconnect)(Chardev *chr);                      /* tcp_chr_disconnect */
    ......
} ChardevClass;
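This is QEMU's usual QOM vtable pattern: class_init fills the function-pointer table once per class, and all callers dispatch through it. A minimal standalone sketch of the pattern (types and names are stand-ins, not the real QOM API):

#include <stdio.h>

typedef struct Chardev Chardev;
typedef struct ChardevClass {
    int (*chr_write)(Chardev *s, const unsigned char *buf, int len);
} ChardevClass;

struct Chardev { ChardevClass *klass; };

/* "class_init": the socket implementation registers its op. */
static int tcp_chr_write(Chardev *s, const unsigned char *buf, int len)
{
    (void)s;
    return (int)fwrite(buf, 1, len, stdout); /* stand-in for a socket send */
}

static ChardevClass socket_class = { .chr_write = tcp_chr_write };

int main(void)
{
    Chardev chr = { &socket_class };
    /* Callers always dispatch through the class, never call tcp_chr_write directly. */
    chr.klass->chr_write(&chr, (const unsigned char *)"hello\n", 6);
    return 0;
}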
When the transport is a socket the concrete backend is ChardevSocket (ChardevUdp when using UDP); all of these are chardevs, i.e. their base class is Chardev. The socket chardev's state:

struct SocketChardev {
    Chardev parent;
    QIOChannel *ioc;         /* Client I/O channel */
    QIOChannelSocket *sioc;  /* Client master channel */
    ......
    SocketAddress *addr;     /* socket address */
    ......
}
With the actors in place, turn to feature negotiation. The user-configured NIC features are stored in VirtIONet:

struct VirtIONet {
    ......
    uint64_t host_features;
}

The generic virtio device keeps three feature words of its own:

struct VirtIODevice
{
    ......
    uint64_t guest_features;    /* features written by the guest, see below */
    uint64_t host_features;     /* features the device offers, see below */
    uint64_t backend_features;
};
guest_features: the guest writes its selected features through the common config register (VIRTIO_PCI_COMMON_GF), and QEMU stores the value here. Related function: virtio_set_features.

host_features: returned when the guest reads the device feature register (VIRTIO_PCI_COMMON_DF); the guest's features can only be a subset of the device's. QEMU takes the user-configured bits saved in VirtIONet.host_features as input and, according to the capabilities of the peer attached to the VirtIONet, adds or removes individual features by hand; the resulting set is stored in VirtIODevice.host_features. Related function: virtio_net_get_features.
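A standalone sketch of the shape of that computation; the peer-capability check shown (dropping UFO when the backend lacks it) mirrors the kind of adjustment virtio_net_get_features makes, and the bit positions are the virtio-net spec values, but the helper itself is illustrative:

#include <stdint.h>
#include <stdio.h>

/* Bit positions per the virtio-net spec. */
#define F_GUEST_UFO (1ULL << 10)   /* VIRTIO_NET_F_GUEST_UFO */
#define F_HOST_UFO  (1ULL << 14)   /* VIRTIO_NET_F_HOST_UFO  */
#define F_MQ        (1ULL << 22)   /* VIRTIO_NET_F_MQ        */

/* Final device features = user-configured bits, minus what the peer cannot do. */
static uint64_t compute_host_features(uint64_t user_cfg, int peer_has_ufo)
{
    uint64_t features = user_cfg;
    if (!peer_has_ufo) {
        features &= ~(F_GUEST_UFO | F_HOST_UFO); /* drop unsupported offloads */
    }
    return features;
}

int main(void)
{
    uint64_t cfg = F_GUEST_UFO | F_HOST_UFO | F_MQ;
    printf("with UFO peer:    %#llx\n", (unsigned long long)compute_host_features(cfg, 1));
    printf("without UFO peer: %#llx\n", (unsigned long long)compute_host_features(cfg, 0));
    return 0;
}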
On the vhost side, vhost_dev keeps four feature words:

struct vhost_dev {
    ......
    uint64_t features;          /* 1 */
    uint64_t acked_features;    /* 2 */
    uint64_t backend_features;  /* 3 */
    uint64_t protocol_features; /* 4 */
}

1. features: the feature bits the slave supports, obtained with the VHOST_USER_GET_FEATURES request. Related function: vhost_dev_init.
2. acked_features: the feature bits acked by the guest. Related function: virtio_net_set_features.
3. backend_features: backend-only bits, in practice VHOST_USER_F_PROTOCOL_FEATURES. Related function: vhost_user_backend_init.
4. protocol_features: the vhost-user protocol features reported by the slave. Related function: vhost_user_backend_init.
NetVhostUserState carries its own acked_features purely to preserve vhost_dev's acked_features (for example across a backend disconnect, as shown at the end of this article):

typedef struct NetVhostUserState {
    NetClientState nc;
    ......
    uint64_t acked_features;
} NetVhostUserState;
acked_features is written to the slave when vhost_dev starts, on the premise that the slave supports those features. QEMU hard-codes the features a vhost-user slave may support as user_feature_bits, and the features a vhost-net (kernel) slave may support as kernel_feature_bits, shown below. When QEMU saves the features set by the front end, a vhost-user NIC may only keep bits present in user_feature_bits, and a vhost-net NIC likewise only bits in kernel_feature_bits.

/* Features supported by others. */
static const int user_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_HASH_REPORT,
    /* This bit implies RARP isn't sent by QEMU out of band */
    VIRTIO_NET_F_GUEST_ANNOUNCE,
    VIRTIO_NET_F_MQ,
    VHOST_INVALID_FEATURE_BIT
};

/* Features supported by host kernel. */
static const int kernel_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_MTU,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_NET_F_HASH_REPORT,
    VHOST_INVALID_FEATURE_BIT
};
On the DPDK side, the per-socket state is vhost_user_socket:

struct vhost_user_socket {
    ......
    char *path;                 /* unix socket path */
    bool is_server;             /* whether DPDK is the server end */
    /*
     * The "supported_features" indicates the feature bits the
     * vhost driver supports. The "features" indicates the feature
     * bits after the rte_vhost_driver_features_disable/enable().
     * It is also the final feature bits used for vhost-user
     * features negotiation.
     */
    uint64_t supported_features;
    uint64_t features;
    uint64_t protocol_features; /* vhost-user protocol features */
    ......
}
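The comment block explains the relation between supported_features and features: the application may trim or extend the offer before negotiation. A standalone sketch of that arithmetic under those assumptions (the masks are illustrative; the real entry points are rte_vhost_driver_disable_features()/rte_vhost_driver_enable_features()):

#include <stdint.h>
#include <stdio.h>

#define VHOST_F_LOG_ALL        (1ULL << 26)
#define VIRTIO_NET_F_HOST_TSO4 (1ULL << 11)

int main(void)
{
    /* What the vhost library itself can do... */
    uint64_t supported_features = (1ULL << 32) - 1;  /* illustrative */
    /* ...is the starting point for what will be offered to QEMU. */
    uint64_t features = supported_features;

    features &= ~VHOST_F_LOG_ALL;        /* app disables a feature  */
    features |= VIRTIO_NET_F_HOST_TSO4;  /* app re-enables a feature */
    features &= supported_features;      /* the offer never exceeds what is supported */

    printf("offered features: %#llx\n", (unsigned long long)features);
    return 0;
}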
The chardev connection is established during QEMU startup:

qemu_init
  qemu_create_late_backends
    net_init_clients
      /* call the init function net_init_netdev for every NIC */
      qemu_opts_foreach(qemu_find_opts("netdev"), net_init_netdev, NULL, errp)
        net_client_init
          net_client_init1
            /* look up the init function by NIC type; for vhost-user
             * this resolves to net_init_vhost_user */
            net_client_init_fun[netdev->type](netdev, netdev->id, peer, errp) <=> net_init_vhost_user
              net_vhost_user_init
                do {
                    qemu_chr_fe_wait_connected
                      qemu_chr_wait_connected
                        cc->chr_wait_connected <=> tcp_chr_wait_connected
                } while (!s->started);
Since our chardev is in server mode, tcp_chr_wait_connected blocks until a client connects:

tcp_chr_wait_connected
    while (s->state != TCP_CHARDEV_STATE_CONNECTED) {
        if (s->is_listen) {
            tcp_chr_accept_server_sync(chr);
              qio_net_listener_wait_client
                g_main_loop_run(loop);
        }
    }
Back in net_vhost_user_init: once the connection is up, it installs net_vhost_user_event as the chardev event handler and loops until the backend reports started:

net_vhost_user_init
    do {
        qemu_chr_fe_wait_connected(&s->chr, &err);
        qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
                                 net_vhost_user_event, NULL, nc0->name, NULL,
                                 true);
    } while (!s->started);
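Stripped of the chardev and GLib machinery, "server mode plus wait for connection" is just a blocking accept on the unix socket named by -chardev. A self-contained sketch of that idea (socket path shortened, error handling minimal; the real flow is of course the GIOChannel-based one above):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>

int main(void)
{
    struct sockaddr_un addr = { .sun_family = AF_UNIX };
    strncpy(addr.sun_path, "/tmp/vhu1.sock", sizeof(addr.sun_path) - 1);

    int lsock = socket(AF_UNIX, SOCK_STREAM, 0);
    unlink(addr.sun_path);  /* remove a stale socket from a prior run */
    bind(lsock, (struct sockaddr *)&addr, sizeof(addr));
    listen(lsock, 1);

    /* QEMU (server side) parks here until the slave (e.g. OVS-DPDK) connects;
     * this is the moral equivalent of tcp_chr_wait_connected()'s loop. */
    int csock = accept(lsock, NULL, NULL);
    if (csock >= 0) {
        printf("slave connected, fd=%d\n", csock);
        close(csock);
    }
    close(lsock);
    return 0;
}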
net_vhost_user_event is the connection callback. It is invoked when the slave first connects at startup, and again whenever the connection drops or comes back while the VM is running; both starting and stopping the vhost-user NIC are implemented in this callback. On connect:

net_vhost_user_event
    switch (event) {
    case CHR_EVENT_OPENED:
        vhost_user_start(queues, ncs, s->vhost_user)
vhost_user_start leads into vhost_net_init:

struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
    int r;
    /* vhost-net NICs offload the datapath to the kernel, so their backend is kernel */
    bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
    /* allocate the vhost_net structure */
    struct vhost_net *net = g_new0(struct vhost_net, 1);

    if (backend_kernel) {
        /* for vhost-net, fetch the backend fd, normally the tap device's fd */
        r = vhost_net_get_fd(options->net_backend);
        /* TODO */
        net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend)
            ? 0 : (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
        /* record the backend file descriptor */
        net->backend = r;
        net->dev.protocol_features = 0;
    } else {
        /* initialize the feature fields */
        net->dev.backend_features = 0;
        net->dev.protocol_features = 0;
        net->backend = -1;
        /* vhost-user needs vq_index to initiate a specific queue pair */
        net->dev.vq_index = net->nc->queue_index * net->dev.nvqs;
    }

    /* initialize vhost_dev; the core action is bringing up the vhost device
     * and completing the vhost protocol negotiation */
    r = vhost_dev_init(&net->dev, options->opaque, options->backend_type,
                       options->busyloop_timeout, &local_err);

    /* take acked_features from NetVhostUserState as the initial value
     * of vhost_dev.acked_features */
    /* Set sane init value. Override when guest acks. */
    if (net->nc->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
        features = vhost_user_get_acked_features(net->nc);
        if (~net->dev.features & features) {
            fprintf(stderr, "vhost lacks feature mask %" PRIu64
                    " for backend\n",
                    (uint64_t)(~net->dev.features & features));
            goto fail;
        }
    }
    /* initialize vhost_dev.acked_features */
    vhost_net_ack_features(net, features);
}
The features-related logic in vhost_dev_init:

int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    uint64_t features;
    /* 1: pick the backend ops; for vhost-user this selects &user_ops */
    vhost_set_backend_type(hdev, backend_type);
    /* 2: vhost_user_backend_init, negotiates backend/protocol features */
    hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    /* 3: VHOST_USER_GET_FEATURES, fetch the features the slave supports */
    hdev->vhost_ops->vhost_get_features(hdev, &features);
    /* 4: record them as this device's feature set */
    hdev->features = features;
    ......
}
vhost_user_backend_init initializes backend_features and protocol_features:

#define VIRTIO_F_BAD_FEATURE 30
#define VHOST_USER_F_PROTOCOL_FEATURES 30

static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
                                   Error **errp)
{
    uint64_t features, protocol_features;

    /* send VHOST_USER_GET_FEATURES to fetch the features the DPDK side supports */
    vhost_user_get_features(dev, &features);

    /* does the DPDK side support VHOST_USER_F_PROTOCOL_FEATURES? */
    if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
        /* if so, record VHOST_USER_F_PROTOCOL_FEATURES in the backend features */
        dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

        /* send VHOST_USER_GET_PROTOCOL_FEATURES to fetch the protocol features */
        vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
                           &protocol_features);

        /* keep the masked protocol features in protocol_features */
        dev->protocol_features =
            protocol_features & VHOST_USER_PROTOCOL_FEATURE_MASK;
        ......
    }
    .....
}
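For a feel of what vhost_user_get_features does on the wire: per the vhost-user spec, each message is a 12-byte header (request, flags, size) optionally followed by a payload, and GET_FEATURES returns a u64 of feature bits. A simplified sketch under those spec-level assumptions; the function name is hypothetical, the fd must already be connected to the slave, and QEMU's real implementation (hw/virtio/vhost-user.c) adds full error handling:

#include <stdint.h>
#include <unistd.h>

#define VHOST_USER_VERSION      1   /* carried in the low 2 bits of flags */
#define VHOST_USER_GET_FEATURES 1   /* request code from the vhost-user spec */

/* Hypothetical helper: ask the slave on fd for its feature bits. */
uint64_t vhost_user_get_features_raw(int fd)
{
    uint32_t hdr[3] = {
        VHOST_USER_GET_FEATURES, /* request */
        VHOST_USER_VERSION,      /* flags   */
        0,                       /* size: GET_FEATURES carries no payload out */
    };
    uint64_t features = 0;

    write(fd, hdr, sizeof(hdr));

    /* Reply: same 12-byte header (size should now be 8), then a u64 payload. */
    read(fd, hdr, sizeof(hdr));
    read(fd, &features, sizeof(features));
    return features;
}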
Two things stand out here. First, backend_features currently holds a single flag, VHOST_USER_F_PROTOCOL_FEATURES. There is a small trick behind it: DPDK defines VHOST_USER_F_PROTOCOL_FEATURES as bit 30, a bit the virtio spec reserves (the host sets it to signal failed negotiation, and a guest never reads or sets it), so DPDK can safely reuse it to advertise support for vhost-user protocol features. Second, when the slave supports VHOST_USER_F_PROTOCOL_FEATURES, QEMU issues a further vhost protocol request to fetch and store the protocol features DPDK supports. Protocol features are defined by the vhost-user protocol itself and their bit positions must not clash with virtio feature bits; VHOST_USER_F_PROTOCOL_FEATURES is not one of user_feature_bits and has nothing to do with the features the virtio spec defines.

Now for the guest side of the negotiation. The virtio spec prescribes the device initialization sequence:

The driver MUST follow this sequence to initialize a device:
1. Reset the device.
2. Set the ACKNOWLEDGE status bit: the guest OS has noticed the device.
3. Set the DRIVER status bit: the guest OS knows how to drive the device.
4. Read device feature bits, and write the subset of feature bits understood by the OS and driver to the
device. During this step the driver MAY read (but MUST NOT write) the device-specific configuration
fields to check that it can support the device before accepting it.
5. Set the FEATURES_OK status bit. The driver MUST NOT accept new feature bits after this step.
6. Re-read device status to ensure the FEATURES_OK bit is still set: otherwise, the device does not
support our subset of features and the device is unusable.
7. Perform device-specific setup, including discovery of virtqueues for the device, optional per-bus setup,
reading and possibly writing the device’s virtio configuration space, and population of virtqueues.
8. Set the DRIVER_OK status bit. At this point the device is “live”.
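The same sequence, written as a schematic driver-side sketch with a mocked status register; the status bit values are the ones the spec defines, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

/* Device status bits from the virtio spec. */
#define ACKNOWLEDGE 1
#define DRIVER      2
#define DRIVER_OK   4
#define FEATURES_OK 8

/* Mock device: stands in for the VIRTIO_PCI_COMMON_* register block. */
static struct { uint64_t device_features, driver_features; uint8_t status; } dev = {
    .device_features = 0x39,  /* whatever the device offers */
};

static void write_status(uint8_t s) { dev.status = s; } /* this device accepts all */
static uint8_t read_status(void) { return dev.status; }

/* Steps 1-8 from the spec, in order. */
static int virtio_driver_init(uint64_t driver_supported)
{
    write_status(0);                                  /* 1: reset */
    write_status(ACKNOWLEDGE);                        /* 2: guest noticed the device */
    write_status(ACKNOWLEDGE | DRIVER);               /* 3: guest can drive it */

    dev.driver_features = dev.device_features & driver_supported; /* 4: subset */

    write_status(ACKNOWLEDGE | DRIVER | FEATURES_OK); /* 5: lock features */
    if (!(read_status() & FEATURES_OK))               /* 6: re-read: device agreed? */
        return -1;
    /* 7: virtqueue discovery and population would happen here */
    write_status(ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); /* 8: live */
    return 0;
}

int main(void)
{
    printf("init: %d, negotiated features: %#llx\n", virtio_driver_init(0x2f),
           (unsigned long long)dev.driver_features);
    return 0;
}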
Step 4 is where feature negotiation happens: the driver reads the device's features (VirtIODevice.host_features), selects the subset it supports, and writes it back to the device. On the QEMU side this write triggers:

virtio_pci_common_write
    switch (addr) {
    case VIRTIO_PCI_COMMON_GF:
        virtio_set_features
            virtio_set_features_nocheck
The virtio_set_features_nocheck function:

static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val)
{
    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
    /* 1: flag any bits the device never offered */
    bool bad = (val & ~(vdev->host_features)) != 0;
    /* 2: mask the guest's selection down to the offered set */
    val &= vdev->host_features;
    if (k->set_features) {
        k->set_features(vdev, val);    /* <=> virtio_net_set_features */
    }
    /* 3: record the negotiated features */
    vdev->guest_features = val;
    return bad ? -1 : 0;
}
virtio_net_set_features eventually calls vhost_ack_features via vhost_net_ack_features, saving the guest-selected features into acked_features so they can later be pushed to the slave:

void vhost_net_ack_features(struct vhost_net *net, uint64_t features)
{
    /* start from backend_features as the initial value to set on the slave */
    net->dev.acked_features = net->dev.backend_features;
    /* fold the guest-selected features into acked_features */
    vhost_ack_features(&net->dev, vhost_net_get_feature_bits(net), features);
}
vhost_ack_features walks the feature list (user_feature_bits for vhost-user), checks whether the guest enabled each bit, and sets the corresponding bit in acked_features if so:

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}
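A tiny standalone run of this filter, using the real bit positions of two entries from user_feature_bits; bit 5 stands in for a guest-set feature that is not in the list and is therefore dropped:

#include <stdint.h>
#include <stdio.h>

#define VHOST_INVALID_FEATURE_BIT 0xff
#define VIRTIO_NET_F_MRG_RXBUF    15   /* spec bit positions */
#define VIRTIO_NET_F_MQ           22

static const int feature_bits[] = {
    VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MQ, VHOST_INVALID_FEATURE_BIT,
};

int main(void)
{
    uint64_t guest = (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << 5);
    uint64_t acked = 0;   /* would start from backend_features in QEMU */

    for (const int *bit = feature_bits; *bit != VHOST_INVALID_FEATURE_BIT; bit++) {
        if (guest & (1ULL << *bit)) {
            acked |= 1ULL << *bit;   /* keep only listed, guest-set bits */
        }
    }
    /* bit 5 is dropped: it is not in feature_bits */
    printf("guest=%#llx acked=%#llx\n",
           (unsigned long long)guest, (unsigned long long)acked);
    return 0;
}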
So after the guest writes its features, they live in two places: first, VirtIODevice.guest_features, filtered only against host_features; second, vhost_dev.acked_features, filtered against host_features and additionally against user_feature_bits, keeping only the bits that list supports. Normally the two end up identical. Note also that QEMU merely saves the guest's features at this point; it does not push them to the slave immediately. That happens when the driver updates the device status and sets DRIVER_OK, step 8 of the initialization sequence above:

virtio_pci_common_write
    switch (addr) {
    case VIRTIO_PCI_COMMON_STATUS:
        virtio_set_status(vdev, val & 0xFF);
            k->set_status <=> virtio_net_set_status
        ......
    }
When the status is written, virtio_net_set_status first checks VirtIONet.vhost_started to see whether the vhost device is already running; if not, it triggers the start-up flow:

virtio_net_set_status
    virtio_net_vhost_status
        if (!n->vhost_started) {
            vhost_net_start
                vhost_net_start_one
            n->vhost_started = 1
        }
vhost_dev_start first pushes to the slave the features the guest selected when writing VIRTIO_PCI_COMMON_GF; then comes the core action: handing the slave, per the vhost protocol, everything it needs to take over the virtio data plane:

vhost_dev_start
    vhost_dev_set_features
        /* fetch the acked_features stored in vhost_dev */
        uint64_t features = dev->acked_features
        /* push them to the slave with VHOST_USER_SET_FEATURES */
        dev->vhost_ops->vhost_set_features
    /* pass the guest memory layout to the slave */
    hdev->vhost_ops->vhost_set_mem_table
    /* pass the virtqueue information */
    vhost_virtqueue_start
Disconnect handling is likewise driven by net_vhost_user_event:

net_vhost_user_event
    switch (event) {
    case CHR_EVENT_CLOSED:
        /* schedule a bottom half; the disconnect handling runs in the main thread */
        aio_bh_schedule_oneshot(ctx, chr_closed_bh, opaque);
    }
static void chr_closed_bh(void *opaque)
{
    const char *name = opaque;
    NetClientState *ncs[MAX_QUEUE_NUM];
    NetVhostUserState *s;
    int queues, i;

    /* walk net_clients and collect the NetClientStates that are NOT of type
     * NET_CLIENT_DRIVER_NIC; for this NIC that yields the NetClientState of
     * every vhost-user queue */
    queues = qemu_find_net_clients_except(name, ncs,
                                          NET_CLIENT_DRIVER_NIC,
                                          MAX_QUEUE_NUM);

    /* NetClientState is the parent of NetVhostUserState:
     * recover the NetVhostUserState pointer from the NetClientState */
    s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);

    /* for each vhost device, stash its acked_features in the
     * acked_features field of NetVhostUserState */
    for (i = queues - 1; i >= 0; i--) {
        s = DO_UPCAST(NetVhostUserState, nc, ncs[i]);
        if (s->vhost_net) {
            s->acked_features = vhost_net_get_acked_features(s->vhost_net);
        }
    }

    /* mark the NIC link as down */
    qmp_set_link(name, false, &err);

    /* keep net_vhost_user_event installed as the socket's event callback */
    qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
                             net_vhost_user_event, NULL, opaque, NULL, true);
    ......
}