KVM模块分析(一)模块初始化

kvm是一种基于硬件辅助的虚拟化解决方案,和qemu模拟器一起来完成整个系统的虚拟化。kvm以内核模块的形式存在,下面以intel虚拟化为例讲述kvm的代码框架和流程。

  • KVM在内核源码中的分布
研究内核模块先要分析一下整个模块的构成,正如下图看到的,x86架构下kvm由两个内核模块组成,kvm.ko模块负责kvm的整个框架实现,而kvm_intel.ko负责与具体架构相关的硬件实现。
                  
KVM目前已经被很多架构平台所支持,除了x86还包括arm,mips和ppc等,从内核代码中也很容易能看出这个关系。
                                   
  •   KVM初始化
在kvm_intel模块初始化一开始主要做了两件事,第一件事挂载vmx_x86_ops回调函数,供后面虚拟化调用;第二件事就是调用kvm_init来初始化kvm框架。
module_init(vmx_init)
       ===》vmx_init //挂载vmx_x86_ops
          ===》kvm_init //初始化kvm框架层
             ===》kvm_arch_init
             ===》kvm_irqfd_init
             ===》kvm_arch_hardware_setup
             ===》kvm_arch_check_processor_compat
             ===》misc_register(&kvm_dev) // 创建字符设备文件/dev/kvm

misc_register函数是linux内核的一个通用接口,主要作用是为了注册设备文件,kvm模块借用该接口创建了/dev/kvm设备文件,下面是设备文件的描述结构:
/*
 * File operations referenced by the kvm_dev misc device descriptor.
 * Both .unlocked_ioctl and .compat_ioctl point at kvm_dev_ioctl, so every
 * ioctl issued on the device node ends up in that one handler.
 */
static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
	.llseek		= noop_llseek,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};
/dev/kvm的ioctl接口kvm_dev_ioctl主要提供了创建vm和校验版本号等功能:
/*
 * ioctl handler of the KVM device node.
 *
 * @filp:  file the ioctl was issued on; only used when the request is
 *         forwarded to kvm_arch_dev_ioctl().
 * @ioctl: request code.
 * @arg:   request argument; must be zero for KVM_GET_API_VERSION and
 *         KVM_GET_VCPU_MMAP_SIZE.
 *
 * Returns the result of the selected request, or -EINVAL when one of the
 * argument-less requests is issued with a non-zero @arg.
 */
static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long mmap_size;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		/* Report the API version; the request takes no argument. */
		if (arg)
			return -EINVAL;
		return KVM_API_VERSION;

	case KVM_CREATE_VM:
		/* Create a VM; the callee's return value is handed back as is. */
		return kvm_dev_ioctl_create_vm(arg);

	case KVM_CHECK_EXTENSION:
		/* Queried without a VM, hence the NULL first argument. */
		return kvm_vm_ioctl_check_extension_generic(NULL, arg);

	case KVM_GET_VCPU_MMAP_SIZE:
		if (arg)
			return -EINVAL;

		/* One page per region, added up. */
		mmap_size = PAGE_SIZE;		/* struct kvm_run */
#ifdef CONFIG_X86
		mmap_size += PAGE_SIZE;		/* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		mmap_size += PAGE_SIZE;		/* coalesced mmio ring page */
#endif
		return mmap_size;

	default:
		/* Any other request is forwarded to the arch-specific handler. */
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
}
kvm_dev_ioctl_create_vm 首先创建vm,然后挂载vmfd的ioctl接口来提供vm的控制
/*
 * Create a VM and an anonymous "kvm-vm" file whose file operations are
 * kvm_vm_fops and whose private data is the new VM.
 *
 * @type: forwarded unchanged to kvm_create_vm().
 *
 * Returns the value obtained from get_unused_fd_flags().
 *
 * NOTE(review): this is an abridged excerpt. None of the three calls is
 * checked for failure, and `file` is never attached to descriptor `r`
 * before `r` is returned. The complete kernel function presumably does
 * both -- confirm against the kernel tree before reusing this code.
 */
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
	struct kvm *kvm;
	struct file *file;
	int r;

	/* Create the VM. */
	kvm = kvm_create_vm(type);

	/* Reserve a descriptor number, opened close-on-exec. */
	r = get_unused_fd_flags(O_CLOEXEC);

	/* Back the VM with an anonymous file driven by kvm_vm_fops. */
	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);

	return r;
}
/*
 * File operations of the anonymous "kvm-vm" file.
 * .compat_ioctl is only provided when CONFIG_KVM_COMPAT is defined.
 */
static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl, /* ioctl entry point of the VM fd: VM-level control */
#ifdef CONFIG_KVM_COMPAT
	.compat_ioctl   = kvm_vm_compat_ioctl,
#endif
	.llseek		= noop_llseek,
};
/*
 * ioctl handler of the VM file (installed as kvm_vm_fops.unlocked_ioctl).
 *
 * @filp:  file the ioctl was issued on; passed on to kvm_arch_vm_ioctl()
 *         for requests not handled here.
 * @ioctl: request code.
 * @arg:   request argument.
 *
 * Returns the result of the selected request. The requests that copy a
 * structure from or to user space return -EFAULT when that copy fails.
 *
 * NOTE(review): the locals `kvm`, `argp` and `r` are used but not declared
 * in this excerpt; their declarations and initialisation appear to have been
 * elided from the original source -- restore them from the kernel tree.
 */
static long kvm_vm_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		/* Create a vcpu. */
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		/* -EFAULT is the result if the user copy below fails. */
		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
						sizeof(kvm_userspace_mem)))
			goto out;

		/* Set up guest memory. */
		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
		/*
		 * Terminate the case: without this break and the closing
		 * brace the block would run on into KVM_IRQFD and leave the
		 * function's braces unbalanced.
		 */
		break;
	}
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof(data)))
			goto out;
		r = kvm_irqfd(kvm, &data);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof(data)))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
	case KVM_CREATE_DEVICE: {
		struct kvm_create_device cd;

		r = -EFAULT;
		if (copy_from_user(&cd, argp, sizeof(cd)))
			goto out;

		r = kvm_ioctl_create_device(kvm, &cd);
		if (r)
			goto out;

		/* Hand the (possibly updated) descriptor back to user space. */
		r = -EFAULT;
		if (copy_to_user(argp, &cd, sizeof(cd)))
			goto out;

		r = 0;
		break;
	}
	case KVM_CHECK_EXTENSION:
		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
		break;
	default:
		/* Any other request is forwarded to the arch-specific handler. */
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}
kvm_vm_ioctl_create_vcpu在创建vcpu时,会再为vcpu fd挂载ioctl接口,以提供vcpu级别的控制:
/*
 * File operations of the vcpu file: release, ioctl (kvm_vcpu_ioctl),
 * mmap (kvm_vcpu_mmap) and llseek.
 */
static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,
	.llseek		= noop_llseek,
};
/*
 * Create the "kvm-vcpu" anonymous-inode descriptor for @vcpu.
 *
 * The descriptor uses kvm_vcpu_fops, so ioctls issued on it are handled by
 * kvm_vcpu_ioctl. Returns whatever anon_inode_getfd() returns.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	const int open_flags = O_RDWR | O_CLOEXEC;

	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, open_flags);
}
vcpu的ioctl接口提供了run方法
/*
 * ioctl handler of the vcpu file.
 *
 * Only the start of the function is shown: it calls vcpu_load(), returns
 * early when that call yields a non-zero value, and dispatches KVM_RUN to
 * kvm_arch_vcpu_ioctl_run().
 *
 * NOTE(review): `vcpu` and `r` are used but not declared in this excerpt;
 * their declarations appear to have been elided.
 */
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
        r = vcpu_load(vcpu);
	if (r)
		return r;
	switch (ioctl) {
	case KVM_RUN:
		/* Run the vcpu; the arch code gets the vcpu and its vcpu->run area. */
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	/* ... excerpt ends here: remaining cases and the function epilogue are not shown ... */
至此,kvm的所有框架都初始化完成,用户态接口也都实现好了。

   
   
   
  • CPU RUN流程
kvm_arch_vcpu_ioctl_run
===》vcpu_run    //死循环进入vcpu_enter_guest
    ===》vcpu_enter_guest  //准备guestos的运行上下文
        ===》kvm_x86_ops->run(vcpu);  //回到我们之前提到的初始化流程,这里调用vmx_vcpu_run
        ===》kvm_x86_ops->handle_exit  //vmexit的处理
vmx_vcpu_run主要实现是一段汇编代码,利用intel的VT-x技术,通过vmlaunch/vmresume指令进入non-root模式(VM entry),并在发生VM exit时返回root模式,从而完成root与non-root模式之间的切换。

     
     
     
  • KVM与qemu的交互流程以及KVM的分布
附件PPT中描述了整个KVM与qemu的交互流程
kvm流程分析.pptx

本文来自网易实践者社区,经作者赵建明授权发布。