• QEMU tap data receive flow


    QEMU reads data directly from tap/tun

    QEMU tap receive steps:

    1. QEMU reads a packet from the tap/tun device.
    2. QEMU places the packet into the virtio virtual NIC.
    3. QEMU raises an interrupt.
    4. The guest receives the interrupt and reads the data from virtio.

    In QEMU, step 1 (tap_read_packet) and step 2 (qemu_send_packet_async) are both performed in tap_send; step 2 is an asynchronous path.

    qemu/net/tap.c:
    static void tap_send(void *opaque)
    {
        TAPState *s = opaque;
        int size;
        int packets = 0;

        while (true) {
            uint8_t *buf = s->buf;
            uint8_t min_pkt[ETH_ZLEN];
            size_t min_pktsz = sizeof(min_pkt);

            /* Step 1: read one packet from the tap/tun fd. */
            size = tap_read_packet(s->fd, s->buf, sizeof(s->buf));
            if (size <= 0) {
                break;
            }

            /* Skip the vnet header if the peer does not expect one. */
            if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
                buf  += s->host_vnet_hdr_len;
                size -= s->host_vnet_hdr_len;
            }

            /* Pad runt frames to the minimum Ethernet length. */
            if (net_peer_needs_padding(&s->nc)) {
                if (eth_pad_short_frame(min_pkt, &min_pktsz, buf, size)) {
                    buf = min_pkt;
                    size = min_pktsz;
                }
            }

            /* Step 2: hand the packet to the peer (asynchronous). */
            size = qemu_send_packet_async(&s->nc, buf, size, tap_send_completed);
            if (size == 0) {
                /* Queued: stop reading until tap_send_completed() fires. */
                tap_read_poll(s, false);
                break;
            } else if (size < 0) {
                break;
            }

            /*
             * When the host keeps receiving more packets while tap_send() is
             * running we can hog the QEMU global mutex.  Limit the number of
             * packets that are processed per tap_send() callback to prevent
             * stalling the guest.
             */
            packets++;
            if (packets >= 50) {
                break;
            }
        }
    }
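
    When qemu_send_packet_async() returns 0, the packet could not be delivered immediately and read polling on the tap fd is disabled. The completion callback passed as its last argument re-enables polling once the queued packet is eventually delivered; in the same file it looks roughly like this (abridged):

    static void tap_send_completed(NetClientState *nc, ssize_t len)
    {
        TAPState *s = DO_UPCAST(TAPState, nc, nc);

        /* Delivery finished; resume reading from the tap fd. */
        tap_read_poll(s, true);
    }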
    

    [Figure: tap_send packet delivery call flow]

    QEMU delivers the packet to the virtio queue via qemu_net_queue_deliver. Before delivery, if `delivering` is set or qemu_can_send_packet fails, the packet is first appended to the `packets` queue and pushed to the virtio queue later during qemu_net_queue_flush. virtio_net_receive in the call flow above is the point where the packet reaches the virtio virtual NIC.
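
    The queueing logic described above lives in qemu/net/queue.c; the core of qemu_net_queue_send is roughly the following (abridged, exact shape varies across QEMU versions):

    ssize_t qemu_net_queue_send(NetQueue *queue,
                                NetClientState *sender,
                                unsigned flags,
                                const uint8_t *data,
                                size_t size,
                                NetPacketSent *sent_cb)
    {
        ssize_t ret;

        if (queue->delivering || !qemu_can_send_packet(sender)) {
            /* Defer: remember the packet and report it as queued (0). */
            qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);
            return 0;
        }

        ret = qemu_net_queue_deliver(queue, sender, flags, data, size);
        if (ret == 0) {
            /* The peer could not take the packet right now; queue it. */
            qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);
            return 0;
        }

        /* Delivery succeeded; retry any packets queued earlier. */
        qemu_net_queue_flush(queue);

        return ret;
    }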

    [Figure: qemu tap data receive flow]

    QEMU reads data from tap/tun via vhost-net

    When the vhost-net driver loads, it creates the /dev/vhost-net device.
    At startup, qemu-kvm opens /dev/vhost-net, which invokes vhost_net_open; vhost_net_open initializes the handle_tx_net and handle_rx_net poll functions.
    handle_tx_net and handle_rx_net ultimately call tun_sendmsg and tun_recvmsg, respectively, to send and receive data.

    drivers/vhost/net.c:
    static int vhost_net_open(struct inode *inode, struct file *f)
    {
    ... ...
    	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
    	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
    ... ...
    }
    
    static void handle_rx_net(struct vhost_work *work)
    {
    	struct vhost_net *net = container_of(work, struct vhost_net,
    					     poll[VHOST_NET_VQ_RX].work);
    	handle_rx(net);
    }
    

    In handle_rx, recvmsg pulls the data from tun, copy_to_iter writes the message into the virtio queue buffers, and finally vhost_add_used_and_signal_n performs the notification so the guest learns that new data has arrived.

    static void handle_rx(struct vhost_net *net)
    {
    ... ...
    		err = sock->ops->recvmsg(sock, &msg,
    					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
    ... ...
    		num_buffers = cpu_to_vhost16(vq, headcount);
    		if (likely(mergeable) &&
    		    copy_to_iter(&num_buffers, sizeof num_buffers,
    				 &fixup) != sizeof num_buffers) {
    			vq_err(vq, "Failed num_buffers write");
    			vhost_discard_vq_desc(vq, headcount);
    			break;
    		}
    		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
    					    headcount);
    ... ...
    }
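
    The notification half of vhost_add_used_and_signal_n is a plain eventfd signal: vhost_signal() writes the virtqueue's call eventfd, which KVM's irqfd machinery turns into a guest interrupt (drivers/vhost/vhost.c, abridged from kernels of this vintage):

    void vhost_add_used_and_signal_n(struct vhost_dev *dev,
    				 struct vhost_virtqueue *vq,
    				 struct vring_used_elem *heads,
    				 unsigned count)
    {
    	vhost_add_used_n(vq, heads, count);
    	vhost_signal(dev, vq);
    }

    void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
    {
    	/* Signal the Guest tell them we used something up. */
    	if (vq->call_ctx && vhost_notify(dev, vq))
    		eventfd_signal(vq->call_ctx, 1);
    }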
    

    vhost-net uses the vhost_worker kernel thread to schedule the work items that drive these poll handlers. The vhost_worker thread is created when QEMU issues the VHOST_SET_OWNER ioctl (vhost_dev_ioctl → vhost_dev_set_owner).

    drivers/vhost/vhost.c:
    static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
    			     void *key)
    {
    	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
    
    	if (!(key_to_poll(key) & poll->mask))
    		return 0;
    
    	vhost_poll_queue(poll);
    	return 0;
    }
    
    void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
    {
    	clear_bit(VHOST_WORK_QUEUED, &work->flags);
    	work->fn = fn;
    }
    EXPORT_SYMBOL_GPL(vhost_work_init);
    
    /* Init poll structure */
    void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
    		     __poll_t mask, struct vhost_dev *dev)
    {
    	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
    	init_poll_funcptr(&poll->table, vhost_poll_func);
    	poll->mask = mask;
    	poll->dev = dev;
    	poll->wqh = NULL;
    
    	vhost_work_init(&poll->work, fn);
    }
    EXPORT_SYMBOL_GPL(vhost_poll_init);
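
    vhost_poll_queue() called from the wakeup above is a thin wrapper around vhost_work_queue(), which links the work item onto dev->work_list and wakes the worker thread that the next listing dequeues from (same file, abridged):

    void vhost_poll_queue(struct vhost_poll *poll)
    {
    	vhost_work_queue(poll->dev, &poll->work);
    }

    void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
    {
    	if (!dev->worker)
    		return;

    	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
    		/* Not yet queued: add it and kick the vhost worker thread. */
    		llist_add(&work->node, &dev->work_list);
    		wake_up_process(dev->worker);
    	}
    }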
    
    static int vhost_worker(void *data)
    {
    ... ...
    	for (;;) {
    	... ...
    		node = llist_del_all(&dev->work_list);
    		if (!node)
    			schedule();
    
    		node = llist_reverse_order(node);
    		/* make sure flag is seen after deletion */
    		smp_wmb();
    		llist_for_each_entry_safe(work, work_next, node, node) {
    			clear_bit(VHOST_WORK_QUEUED, &work->flags);
    			__set_current_state(TASK_RUNNING);
    			work->fn(work);
    			if (need_resched())
    				schedule();
    		}
    	... ...
    	}
    ... ...
    }
    
    long vhost_dev_set_owner(struct vhost_dev *dev)
    {
    ... ...
    	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
    ... ...
    }
    
    long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
    {
    ... ...
    	if (ioctl == VHOST_SET_OWNER) {
    		r = vhost_dev_set_owner(d);
    		goto done;
    	}
    ... ...
    }
    
    drivers/vhost/net.c:
    static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
    			    unsigned long arg)
    {
    ... ...
    	switch (ioctl) {
    ... ...
    	case VHOST_RESET_OWNER:
    		return vhost_net_reset_owner(n);
    	case VHOST_SET_OWNER:
    		return vhost_net_set_owner(n);
    	default:
    		mutex_lock(&n->dev.mutex);
    		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
    		if (r == -ENOIOCTLCMD)
    			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
    		else
    			vhost_net_flush(n);
    		mutex_unlock(&n->dev.mutex);
    		return r;
    	}
    }
    
    static long vhost_net_set_owner(struct vhost_net *n)
    {
    ... ...
    	r = vhost_dev_set_owner(&n->dev);
    ... ...
    	return r;
    }
    
    static const struct file_operations vhost_net_fops = {
    	.owner          = THIS_MODULE,
    	.release        = vhost_net_release,
    	.read_iter      = vhost_net_chr_read_iter,
    	.write_iter     = vhost_net_chr_write_iter,
    	.poll           = vhost_net_chr_poll,
    	.unlocked_ioctl = vhost_net_ioctl,
    #ifdef CONFIG_COMPAT
    	.compat_ioctl   = vhost_net_compat_ioctl,
    #endif
    	.open           = vhost_net_open,
    	.llseek		= noop_llseek,
    };
    
    static struct miscdevice vhost_net_misc = {
    	.minor = VHOST_NET_MINOR,
    	.name = "vhost-net",
    	.fops = &vhost_net_fops,
    };
    
    static int vhost_net_init(void)
    {
    	if (experimental_zcopytx)
    		vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
    	return misc_register(&vhost_net_misc);
    }
    

    When the host vhost driver loads, vhost_net_init registers a misc device, which creates the /dev/vhost-net device file.
    At startup, qemu-kvm opens that device; the corresponding vhost_net_open creates the queues and attaches the send/receive handlers, and a subsequent ioctl starts the vhost kernel thread that processes packet transmission and reception (a minimal userspace sketch of this sequence follows below).
    QEMU also configures the KVM module via ioctl, mainly to set up the notification paths: vhost and virtio only move packets, while KVM delivers the kicks and interrupts.
    In the guest, the virtio driver registers, creating the guest network device and configuring its interrupt and NAPI handling.
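
    As a minimal illustration of that setup, a userspace program can reproduce the first two steps qemu-kvm performs. This sketch only assumes the standard <linux/vhost.h> UAPI; error handling is abbreviated and the later VHOST_SET_MEM_TABLE / VHOST_SET_VRING_* calls are omitted:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    int main(void)
    {
        /* Step 1: open the misc device registered by vhost_net_init(). */
        int vhost_fd = open("/dev/vhost-net", O_RDWR);
        if (vhost_fd < 0) {
            perror("open /dev/vhost-net");
            return 1;
        }

        /* Step 2: VHOST_SET_OWNER -> vhost_net_set_owner ->
         * vhost_dev_set_owner, which kthread_create()s the
         * "vhost-<pid>" worker thread shown earlier. */
        if (ioctl(vhost_fd, VHOST_SET_OWNER) < 0) {
            perror("ioctl VHOST_SET_OWNER");
            close(vhost_fd);
            return 1;
        }

        /* ... VHOST_SET_MEM_TABLE / VHOST_SET_VRING_* / backend setup ... */
        close(vhost_fd);
        return 0;
    }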
    Guest transmit flow:
    1. A packet from the application goes down the guest protocol stack to the driver's ndo_start_xmit (start_xmit in virtio-net), which places it on the TX queue; vp_notify then notifies KVM.
    2. KVM, through vmx_handle_exit and a chain of calls ending in wake_up_process, wakes the vhost thread.
    3. The vhost thread picks up the packet and sends it through the previously bound handler handle_tx_kick, which calls the tap device's tun_sendmsg and finally netif_rx, injecting the packet into the host kernel protocol stack.

    Guest receive flow:
    1. The tap device's ndo_start_xmit (tun_net_xmit) ultimately calls wake_up_process to wake the vhost thread, which runs handle_rx_kick and places the packet into the receive queue.
    2. A chain of calls then reaches the KVM interface kvm_vcpu_kick, which injects an interrupt into the guest.
    3. The guest virtio driver's interrupt handler vp_interrupt runs virtnet_poll, which ends in netif_receive_skb, delivering the packet to the guest protocol stack.
    Source: https://blog.csdn.net/qq_20817327/article/details/106838029
