qemuRemoveCgroup(vm);
if (virFileMakePath(cfg->logDir) < 0) {
virReportSystemError(errno,
_("cannot create log directory %s"),
cfg->logDir);
goto cleanup;
}
if ((logfile = qemuDomainCreateLog(driver, vm, false)) < 0)
goto cleanup;
#检查/dev/kvm设备文件是否存在
if (vm->def->virtType == VIR_DOMAIN_VIRT_KVM) {
VIR_DEBUG("Checking for KVM availability");
if (!virFileExists("/dev/kvm")) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("Domain requires KVM, but it is not available. "
"Check that virtualization is enabled in the host BIOS, "
"and host configuration is setup to load the kvm modules."));
goto cleanup;
}
}
if (!qemuValidateCpuMax(vm->def, priv->qemuCaps))
goto cleanup;
if (qemuAssignDeviceAliases(vm->def, priv->qemuCaps) < 0)
goto cleanup;
#分配的别名可以通过virsh dumpxml命令查看。
#通过lscpu命令可以看到宿主机的numa配置。
#如果配置文件中指定numa为自动模式,会从numad中获取自动分配的结果。
if ((vm->def->placement_mode ==
VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) ||
(vm->def->numatune.memory.placement_mode ==
VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) {
nodeset = virNumaGetAutoPlacementAdvice(vm->def->vcpus,
vm->def->mem.max_balloon);
if (!nodeset)
goto cleanup;
VIR_DEBUG("Nodeset returned from numad: %s", nodeset);
if (virBitmapParse(nodeset, 0, &nodemask,
VIR_DOMAIN_CPUMASK_LEN) < 0)
goto cleanup;
}
#在hook中记录对应的numa配置
hookData.nodemask = nodemask;
if (VIR_ALLOC(priv->monConfig) < 0)
goto cleanup;
if (qemuProcessPrepareMonitorChr(cfg, priv->monConfig, vm->def->name) < 0)
goto cleanup;
priv->monJSON = virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_MONITOR_JSON);
priv->monError = false;
priv->monStart = 0;
priv->gotShutdown = false;
VIR_FREE(priv->pidfile);
if (!(priv->pidfile = virPidFileBuildPath(cfg->stateDir, vm->def->name))) {
virReportSystemError(errno,
"%s", _("Failed to build pidfile path."));
goto cleanup;
}
if (unlink(priv->pidfile) < 0 &&
errno != ENOENT) {
virReportSystemError(errno,
_("Cannot remove stale PID file %s"),
priv->pidfile);
goto cleanup;
}
if (virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) {
VIR_DEBUG("Assigning domain PCI addresses");
if ((qemuDomainAssignAddresses(vm->def, priv->qemuCaps, vm)) < 0)
goto cleanup;
}
if (!(cmd = qemuBuildCommandLine(conn, driver, vm->def, priv->monConfig,
priv->monJSON, priv->qemuCaps,
migrateFrom, stdin_fd, snapshot, vmop,
&buildCommandLineCallbacks)))
goto cleanup;
if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) {
char *xml = qemuDomainDefFormatXML(driver, vm->def, 0);
int hookret;
hookret = virHookCall(VIR_HOOK_DRIVER_QEMU, vm->def->name,
VIR_HOOK_QEMU_OP_START, VIR_HOOK_SUBOP_BEGIN,
NULL, xml, NULL);
VIR_FREE(xml);
if (hookret < 0)
goto cleanup;
}
if ((timestamp = virTimeStringNow()) == NULL) {
goto cleanup;
} else {
if (safewrite(logfile, timestamp, strlen(timestamp)) < 0 ||
safewrite(logfile, START_POSTFIX, strlen(START_POSTFIX)) < 0) {
VIR_WARN("Unable to write timestamp to logfile: %s",
virStrerror(errno, ebuf, sizeof(ebuf)));
}
VIR_FREE(timestamp);
}
virCommandWriteArgLog(cmd, logfile);
qemuDomainObjCheckTaint(driver, vm, logfile);
if ((pos = lseek(logfile, 0, SEEK_END)) < 0)
VIR_WARN("Unable to seek to end of logfile: %s",
virStrerror(errno, ebuf, sizeof(ebuf)));
virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData);
virCommandSetMaxProcesses(cmd, cfg->maxProcesses);
virCommandSetMaxFiles(cmd, cfg->maxFiles);
VIR_DEBUG("Setting up security labelling");
if (virSecurityManagerSetChildProcessLabel(driver->securityManager,
vm->def, cmd) < 0) {
goto cleanup;
}
#qemu的标准输出定向到日志文件
virCommandSetOutputFD(cmd, &logfile);
#qemu错误输出定向到日志文件
virCommandSetErrorFD(cmd, &logfile);
virCommandNonblockingFDs(cmd);
virCommandSetPidFile(cmd, priv->pidfile);
virCommandDaemonize(cmd);
#创建一个握手连接,用于qemu和libvirt之间通信。可以确保hook的执行时间可以由libvirtd控制。当qemu进程启动,但是还未完成的时候,libvirtd没有通过这个连接发送信号,qemu的hook不会执行。qemu进程启动完成之后,libvirtd检测到并且发送信号,这时候才去执行qemu的hook脚本。
virCommandRequireHandshake(cmd);
ret = virCommandRun(cmd, NULL);
#通过fork的返回值和pid file的内容判断qemu进程是否已经成功启动
if (ret == 0) {
if (virPidFileReadPath(priv->pidfile, &vm->pid) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Domain %s didn't show up"), vm->def->name);
ret = -1;
}
VIR_DEBUG("QEMU vm=%p name=%s running with pid=%llu",
vm, vm->def->name, (unsigned long long)vm->pid);
} else {
VIR_DEBUG("QEMU vm=%p name=%s failed to spawn",
vm, vm->def->name);
}
if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) {
goto cleanup;
}
if (virCommandHandshakeWait(cmd) < 0) {
goto cleanup;
}
#首先要初始化当前虚拟机的cgroup目录,在每一个cgroup子系统的machine层级下创建虚拟机对应的层级。
#device子系统,设置当前虚拟机可以访问的设备号。
#blkio子系统,设置磁盘qos参数。
#memory子系统,设置内存qos参数,这个目前暂时没有配置。
#cpu子系统,设置cpu qos参数。只是设置其中的share参数,即CPU权重,同样VCPU数量的前提下,权重越大,获得的CPU时间越多。
#cpuset子系统的设置项较多,包括:
#如果配置文件中指定了numatune配置,则使用指定的参数。如果没有指定,则使用默认生成的推荐参数。
#如果配置文件中指定CPU绑定方式为auto,则会根据默认生成的numa配置参数配置相应的CPU绑定关系。如果指定了CPU绑定关系,则按照指定的绑定关系配置。
if (qemuSetupCgroup(driver, vm, nodemask) < 0)
goto cleanup;
if (!vm->def->cputune.emulatorpin &&
qemuProcessInitCpuAffinity(driver, vm, nodemask) < 0)
goto cleanup;
if (virCommandHandshakeNotify(cmd) < 0) {
goto cleanup;
}
if (migrateFrom)
flags |= VIR_QEMU_PROCESS_START_PAUSED;
if (qemuProcessWaitForMonitor(driver, vm, priv->qemuCaps, pos) < 0)
goto cleanup;
if (qemuConnectAgent(driver, vm) < 0) {
VIR_WARN("Cannot connect to QEMU guest agent for %s",
vm->def->name);
virResetLastError();
priv->agentError = true;
}
<vcpu placement='static' cpuset="1-4,^3,6" current="1">2</vcpu>
和
<cputune>
<vcpupin vcpu="0" cpuset="1-4,^2"/>
<vcpupin vcpu="1" cpuset="0,1"/>
<vcpupin vcpu="2" cpuset="2,3"/>
<vcpupin vcpu="3" cpuset="0,4"/>
<emulatorpin cpuset="1-3"/>
<iothreadpin iothread="1" cpuset="5,6"/>
<iothreadpin iothread="2" cpuset="7,8"/>
<shares>2048</shares>
<period>1000000</period>
<quota>-1</quota>
<emulator_period>1000000</emulator_period>
<emulator_quota>-1</emulator_quota>
<iothread_period>1000000</iothread_period>
<iothread_quota>-1</iothread_quota>
<vcpusched vcpus='0-4,^3' scheduler='fifo' priority='1'/>
<iothreadsched iothreads='2' scheduler='batch'/>
</cputune>
上面我们已经根据vcpu的placement设置过一次亲和性,那一次是设置整个qemu进程的亲和性。libvirt同时还提供了更细粒度的设置方式cputune。libvirt的策略是两处同时指定的话,cputune会覆盖vcpu placement的配置。
#因为vcpu实际上是qemu进程中的线程,通过线程号来绑定vcpu的亲和性,所以需要先获取qemu中所有的线程号,包括emulator和vcpu。
if (qemuProcessDetectVcpuPIDs(driver, vm) < 0)
goto cleanup;
#设置vcpu的pin,quota和period等参数
if (qemuSetupCgroupForVcpu(vm) < 0)
goto cleanup;
#设置emulator的cputune参数
if (qemuSetupCgroupForEmulator(driver, vm, nodemask) < 0)
goto cleanup;
#通过taskset设置vcpu线程和emulator的cpu亲和性。如果没有配置单独的vcpupin直接返回,否则按照vcpupin的配置设置线程亲和性。如果cputune中配置了emulatorpin信息优先使用此配置,否则尝试使用vcpu placement中的cpuset信息,如果都没有直接返回。
#为什么要做这两步设置,具体原因不是很清楚。个人理解是:首先尝试通过cgroup设置,如果cgroup不存在,则继续通过taskset设置;如果cgroup存在,则相当于设置了两次。
if (qemuProcessSetVcpuAffinities(conn, vm) < 0)
goto cleanup;
if (qemuProcessSetEmulatorAffinities(conn, vm) < 0)
goto cleanup;
if (qemuProcessInitPasswords(conn, driver, vm) < 0)
goto cleanup;
if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) {
VIR_DEBUG("Determining domain device PCI addresses");
if (qemuProcessInitPCIAddresses(driver, vm) < 0)
goto cleanup;
}
qemuDomainObjEnterMonitor(driver, vm);
if (qemuProcessSetLinkStates(vm) < 0) {
qemuDomainObjExitMonitor(driver, vm);
goto cleanup;
}
qemuDomainObjExitMonitor(driver, vm);
if (qemuDomainUpdateDeviceList(driver, vm) < 0)
goto cleanup;
cur_balloon = vm->def->mem.cur_balloon;
if (cur_balloon != vm->def->mem.cur_balloon) {
virReportError(VIR_ERR_OVERFLOW,
_("unable to set balloon to %lld"),
vm->def->mem.cur_balloon);
goto cleanup;
}
qemuDomainObjEnterMonitor(driver, vm);
if (vm->def->memballoon && vm->def->memballoon->period)
qemuMonitorSetMemoryStatsPeriod(priv->mon, vm->def->memballoon->period);
if (qemuMonitorSetBalloon(priv->mon, cur_balloon) < 0) {
qemuDomainObjExitMonitor(driver, vm);
goto cleanup;
}
qemuDomainObjExitMonitor(driver, vm);
if (!(flags & VIR_QEMU_PROCESS_START_PAUSED)) {
if (qemuProcessStartCPUs(driver, vm, conn,
VIR_DOMAIN_RUNNING_BOOTED,
QEMU_ASYNC_JOB_NONE) < 0) {
if (virGetLastError() == NULL)
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("resume operation failed"));
goto cleanup;
}
} else {
virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
migrateFrom ?
VIR_DOMAIN_PAUSED_MIGRATION :
VIR_DOMAIN_PAUSED_USER);
}
if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0)
goto cleanup;
相关阅读:虚拟机创建流程-libvirt篇(上)
本文来自网易实践者社区,经作者岳文远授权发布。