本文以 QEMU 8.2.2 为例,分析其作为系统仿真工具的启动过程,并为读者展示各种 QEMU 系统仿真的启动配置实例。
本文读者需要具备一定的 QEMU 系统仿真使用经验,并对 C 语言编程有一定了解。
QEMU 是一个通用且开源的机器模拟器和虚拟机。
其官方主页是:https://www.qemu.org/
QEMU 作为系统仿真工具,其入口代码在 system/main.c 文件中,初始化函数 qemu_init() 的实现在 system/vl.c 文件中,在完成 QEMU 虚拟机配置项处理和应用后,系统将配置虚拟机的加速器。本篇文章将完成以下代码部分的分析。
这部分代码在 system/vl.c 文件中,实现如下:
void qemu_init(int argc, char **argv)
{
...
/*
* Note: uses machine properties such as kernel-irqchip, must run
* after qemu_apply_machine_options.
*/
configure_accelerators(argv[0]);
phase_advance(PHASE_ACCEL_CREATED);
...
}
代码首先通过函数 qemu_disable_default_devices() 将所有默认设备的设置复位。
此函数在 /system/vl.c 文件中,定义如下:
static void configure_accelerators(const char *progname)
{
bool init_failed = false;
qemu_opts_foreach(qemu_find_opts("icount"),
do_configure_icount, NULL, &error_fatal);
if (QTAILQ_EMPTY(&qemu_accel_opts.head)) {
char **accel_list, **tmp;
if (accelerators == NULL) {
/* Select the default accelerator */
bool have_tcg = accel_find("tcg");
bool have_kvm = accel_find("kvm");
if (have_tcg && have_kvm) {
if (g_str_has_suffix(progname, "kvm")) {
/* If the program name ends with "kvm", we prefer KVM */
accelerators = "kvm:tcg";
} else {
accelerators = "tcg:kvm";
}
} else if (have_kvm) {
accelerators = "kvm";
} else if (have_tcg) {
accelerators = "tcg";
} else {
error_report("No accelerator selected and"
" no default accelerator available");
exit(1);
}
}
accel_list = g_strsplit(accelerators, ":", 0);
for (tmp = accel_list; *tmp; tmp++) {
/*
* Filter invalid accelerators here, to prevent obscenities
* such as "-machine accel=tcg,,thread=single".
*/
if (accel_find(*tmp)) {
qemu_opts_parse_noisily(qemu_find_opts("accel"), *tmp, true);
} else {
init_failed = true;
error_report("invalid accelerator %s", *tmp);
}
}
g_strfreev(accel_list);
} else {
if (accelerators != NULL) {
error_report("The -accel and \"-machine accel=\" options are incompatible");
exit(1);
}
}
if (!qemu_opts_foreach(qemu_find_opts("accel"),
do_configure_accelerator, &init_failed, &error_fatal)) {
if (!init_failed) {
error_report("no accelerator found");
}
exit(1);
}
if (init_failed && !qtest_chrdev) {
error_report("falling back to %s", current_accel_name());
}
if (icount_enabled() && !tcg_enabled()) {
error_report("-icount is not allowed with hardware virtualization");
exit(1);
}
}
、
函数 do_configure_icount() 定义如下:
static int do_configure_icount(void *opaque, QemuOpts *opts, Error **errp)
{
icount_configure(opts, errp);
return 0;
}
而函数 icount_configure() 在 TCG 加速器中定义如下:
void icount_configure(QemuOpts *opts, Error **errp)
{
const char *option = qemu_opt_get(opts, "shift");
bool sleep = qemu_opt_get_bool(opts, "sleep", true);
bool align = qemu_opt_get_bool(opts, "align", false);
long time_shift = -1;
if (!option) {
if (qemu_opt_get(opts, "align") != NULL) {
error_setg(errp, "Please specify shift option when using align");
}
return;
}
if (align && !sleep) {
error_setg(errp, "align=on and sleep=off are incompatible");
return;
}
if (strcmp(option, "auto") != 0) {
if (qemu_strtol(option, NULL, 0, &time_shift) < 0
|| time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
error_setg(errp, "icount: Invalid shift value");
return;
}
} else if (icount_align_option) {
error_setg(errp, "shift=auto and align=on are incompatible");
return;
} else if (!icount_sleep) {
error_setg(errp, "shift=auto and sleep=off are incompatible");
return;
}
icount_sleep = sleep;
if (icount_sleep) {
timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
icount_timer_cb, NULL);
}
icount_align_option = align;
if (time_shift >= 0) {
timers_state.icount_time_shift = time_shift;
icount_enable_precise();
return;
}
icount_enable_adaptive();
/*
* 125MIPS seems a reasonable initial guess at the guest speed.
* It will be corrected fairly quickly anyway.
*/
timers_state.icount_time_shift = 3;
/*
* Have both realtime and virtual time triggers for speed adjustment.
* The realtime trigger catches emulated time passing too slowly,
* the virtual time trigger catches emulated time passing too fast.
* Realtime triggers occur even when idle, so use them less frequently
* than VM triggers.
*/
timers_state.vm_clock_warp_start = -1;
timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
icount_adjust_rt, NULL);
timer_mod(timers_state.icount_rt_timer,
qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
icount_adjust_vm, NULL);
timer_mod(timers_state.icount_vm_timer,
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
NANOSECONDS_PER_SECOND / 10);
}
如果没有设定加速器,默认查找是否支持 TCG 或 KVM 加速器,如果有则配置默认加速器,实现代码如下:
if (accelerators == NULL) {
/* Select the default accelerator */
bool have_tcg = accel_find("tcg");
bool have_kvm = accel_find("kvm");
if (have_tcg && have_kvm) {
if (g_str_has_suffix(progname, "kvm")) {
/* If the program name ends with "kvm", we prefer KVM */
accelerators = "kvm:tcg";
} else {
accelerators = "tcg:kvm";
}
} else if (have_kvm) {
accelerators = "kvm";
} else if (have_tcg) {
accelerators = "tcg";
} else {
error_report("No accelerator selected and"
" no default accelerator available");
exit(1);
}
}
函数 do_configure_accelerator() 定义如下:
static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp)
{
bool *p_init_failed = opaque;
const char *acc = qemu_opt_get(opts, "accel");
AccelClass *ac = accel_find(acc);
AccelState *accel;
int ret;
bool qtest_with_kvm;
if (!acc) {
error_setg(errp, QERR_MISSING_PARAMETER, "accel");
goto bad;
}
qtest_with_kvm = g_str_equal(acc, "kvm") && qtest_chrdev != NULL;
if (!ac) {
if (!qtest_with_kvm) {
error_report("invalid accelerator %s", acc);
}
goto bad;
}
accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac)));
object_apply_compat_props(OBJECT(accel));
qemu_opt_foreach(opts, accelerator_set_property,
accel,
&error_fatal);
/*
* If legacy -singlestep option is set, honour it for TCG and
* silently ignore for any other accelerator (which is how this
* option has always behaved).
*/
if (opt_one_insn_per_tb) {
/*
* This will always succeed for TCG, and we want to ignore
* the error from trying to set a nonexistent property
* on any other accelerator.
*/
object_property_set_bool(OBJECT(accel), "one-insn-per-tb", true, NULL);
}
ret = accel_init_machine(accel, current_machine);
if (ret < 0) {
if (!qtest_with_kvm || ret != -ENOENT) {
error_report("failed to initialize %s: %s", acc, strerror(-ret));
}
goto bad;
}
return 1;
bad:
*p_init_failed = true;
return 0;
}
函数 accel_find() 定义如下:
/* Lookup AccelClass from opt_name. Returns NULL if not found */
AccelClass *accel_find(const char *opt_name)
{
char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
AccelClass *ac = ACCEL_CLASS(module_object_class_by_name(class_name));
g_free(class_name);
return ac;
}
数据结构 struct AccelClass 定义如下:
struct AccelState {
/*< private >*/
Object parent_obj;
};
typedef struct AccelClass {
/*< private >*/
ObjectClass parent_class;
/*< public >*/
const char *name;
int (*init_machine)(MachineState *ms);
#ifndef CONFIG_USER_ONLY
void (*setup_post)(MachineState *ms, AccelState *accel);
bool (*has_memory)(MachineState *ms, AddressSpace *as,
hwaddr start_addr, hwaddr size);
#endif
bool (*cpu_common_realize)(CPUState *cpu, Error **errp);
void (*cpu_common_unrealize)(CPUState *cpu);
/* gdbstub related hooks */
int (*gdbstub_supported_sstep_flags)(void);
bool *allowed;
/*
* Array of global properties that would be applied when specific
* accelerator is chosen. It works like MachineClass.compat_props
* but it's for accelerators not machines. Accelerator-provided
* global properties may be overridden by machine-type
* compat_props or user-provided global properties.
*/
GPtrArray *compat_props;
} AccelClass;
函数 accel_init_machine() 定义如下:
int accel_init_machine(AccelState *accel, MachineState *ms)
{
AccelClass *acc = ACCEL_GET_CLASS(accel);
int ret;
ms->accelerator = accel;
*(acc->allowed) = true;
ret = acc->init_machine(ms);
if (ret < 0) {
ms->accelerator = NULL;
*(acc->allowed) = false;
object_unref(OBJECT(accel));
} else {
object_set_accelerator_compat_props(acc->compat_props);
}
return ret;
}
通过 accel_init_machine() 将目标机器和加速器关联在一起。
加速器的 init_machine() 函数根据每个加速器不同,有不同的实现,WHPX 的定义如下:
/*
* Partition support
*/
static int whpx_accel_init(MachineState *ms)
{
struct whpx_state *whpx;
int ret;
HRESULT hr;
WHV_CAPABILITY whpx_cap;
UINT32 whpx_cap_size;
WHV_PARTITION_PROPERTY prop;
UINT32 cpuidExitList[] = {1, 0x80000001};
WHV_CAPABILITY_FEATURES features = {0};
whpx = &whpx_global;
if (!init_whp_dispatch()) {
ret = -ENOSYS;
goto error;
}
whpx->mem_quota = ms->ram_size;
hr = whp_dispatch.WHvGetCapability(
WHvCapabilityCodeHypervisorPresent, &whpx_cap,
sizeof(whpx_cap), &whpx_cap_size);
if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
error_report("WHPX: No accelerator found, hr=%08lx", hr);
ret = -ENOSPC;
goto error;
}
hr = whp_dispatch.WHvGetCapability(
WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
if (FAILED(hr)) {
error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
if (FAILED(hr)) {
error_report("WHPX: Failed to create partition, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
/*
* Query the XSAVE capability of the partition. Any error here is not
* considered fatal.
*/
hr = whp_dispatch.WHvGetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeProcessorXsaveFeatures,
&whpx_xsave_cap,
sizeof(whpx_xsave_cap),
&whpx_cap_size);
/*
* Windows version which don't support this property will return with the
* specific error code.
*/
if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
}
if (!whpx_has_xsave()) {
printf("WHPX: Partition is not XSAVE capable\n");
}
memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
prop.ProcessorCount = ms->smp.cpus;
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeProcessorCount,
&prop,
sizeof(WHV_PARTITION_PROPERTY));
if (FAILED(hr)) {
error_report("WHPX: Failed to set partition processor count to %u,"
" hr=%08lx", prop.ProcessorCount, hr);
ret = -EINVAL;
goto error;
}
/*
* Error out if WHP doesn't support apic emulation and user is requiring
* it.
*/
if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
!whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
error_report("WHPX: kernel irqchip requested, but unavailable. "
"Try without kernel-irqchip or with kernel-irqchip=off");
ret = -EINVAL;
goto error;
}
if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
WHvX64LocalApicEmulationModeXApic;
printf("WHPX: setting APIC emulation mode in the hypervisor\n");
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeLocalApicEmulationMode,
&mode,
sizeof(mode));
if (FAILED(hr)) {
error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
if (whpx->kernel_irqchip_required) {
error_report("WHPX: kernel irqchip requested, but unavailable");
ret = -EINVAL;
goto error;
}
} else {
whpx->apic_in_platform = true;
}
}
/* Register for MSR and CPUID exits */
memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
prop.ExtendedVmExits.X64MsrExit = 1;
prop.ExtendedVmExits.X64CpuidExit = 1;
prop.ExtendedVmExits.ExceptionExit = 1;
if (whpx_apic_in_platform()) {
prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
}
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeExtendedVmExits,
&prop,
sizeof(WHV_PARTITION_PROPERTY));
if (FAILED(hr)) {
error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeCpuidExitList,
cpuidExitList,
RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
if (FAILED(hr)) {
error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
hr);
ret = -EINVAL;
goto error;
}
/*
* We do not want to intercept any exceptions from the guest,
* until we actually start debugging with gdb.
*/
whpx->exception_exit_bitmap = -1;
hr = whpx_set_exception_exit_bitmap(0);
if (FAILED(hr)) {
error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
hr = whp_dispatch.WHvSetupPartition(whpx->partition);
if (FAILED(hr)) {
error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
ret = -EINVAL;
goto error;
}
whpx_memory_init();
printf("Windows Hypervisor Platform accelerator is operational\n");
return 0;
error:
if (NULL != whpx->partition) {
whp_dispatch.WHvDeletePartition(whpx->partition);
whpx->partition = NULL;
}
return ret;
}
函数 init_whp_dispatch() 在 /target/i386/whpx/whpx-all.c 文件中,定义如下:
bool init_whp_dispatch(void)
{
if (whp_dispatch_initialized) {
return true;
}
if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
goto error;
}
if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
goto error;
}
assert(load_whp_dispatch_fns(&hWinHvPlatform,
WINHV_PLATFORM_FNS_SUPPLEMENTAL));
whp_dispatch_initialized = true;
return true;
error:
if (hWinHvPlatform) {
FreeLibrary(hWinHvPlatform);
}
if (hWinHvEmulation) {
FreeLibrary(hWinHvEmulation);
}
return false;
}
函数 load_whp_dispatch_fns() 定义如下:
/*
* Load the functions from the given library, using the given handle. If a
* handle is provided, it is used, otherwise the library is opened. The
* handle will be updated on return with the opened one.
*/
static bool load_whp_dispatch_fns(HMODULE *handle,
WHPFunctionList function_list)
{
HMODULE hLib = *handle;
#define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
#define WINHV_EMULATION_DLL "WinHvEmulation.dll"
#define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
whp_dispatch.function_name = \
(function_name ## _t)GetProcAddress(hLib, #function_name); \
#define WHP_LOAD_FIELD(return_type, function_name, signature) \
whp_dispatch.function_name = \
(function_name ## _t)GetProcAddress(hLib, #function_name); \
if (!whp_dispatch.function_name) { \
error_report("Could not load function %s", #function_name); \
goto error; \
} \
#define WHP_LOAD_LIB(lib_name, handle_lib) \
if (!handle_lib) { \
handle_lib = LoadLibrary(lib_name); \
if (!handle_lib) { \
error_report("Could not load library %s.", lib_name); \
goto error; \
} \
} \
switch (function_list) {
case WINHV_PLATFORM_FNS_DEFAULT:
WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
break;
case WINHV_EMULATION_FNS_DEFAULT:
WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
break;
case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
break;
}
*handle = hLib;
return true;
error:
if (hLib) {
FreeLibrary(hLib);
}
return false;
}
WHPX 提供的函数类型列表如下:
typedef enum WHPFunctionList {
WINHV_PLATFORM_FNS_DEFAULT,
WINHV_EMULATION_FNS_DEFAULT,
WINHV_PLATFORM_FNS_SUPPLEMENTAL
} WHPFunctionList;
接下来,将机器状态置为 PHASE_ACCEL_CREATED,实现代码如下:
phase_advance(PHASE_ACCEL_CREATED);
函数 phase_advance() 定义如下:
void phase_advance(MachineInitPhase phase)
{
assert(machine_phase == phase - 1);
machine_phase = phase;
}
机器状态 machine_phase 的数据类型定义如下:
typedef enum MachineInitPhase {
/* current_machine is NULL. */
PHASE_NO_MACHINE,
/* current_machine is not NULL, but current_machine->accel is NULL. */
PHASE_MACHINE_CREATED,
/*
* current_machine->accel is not NULL, but the machine properties have
* not been validated and machine_class->init has not yet been called.
*/
PHASE_ACCEL_CREATED,
/*
* machine_class->init has been called, thus creating any embedded
* devices and validating machine properties. Devices created at
* this time are considered to be cold-plugged.
*/
PHASE_MACHINE_INITIALIZED,
/*
* QEMU is ready to start CPUs and devices created at this time
* are considered to be hot-plugged. The monitor is not restricted
* to "preconfig" commands.
*/
PHASE_MACHINE_READY,
} MachineInitPhase;
至此,完成了对象字典中与虚拟机相关的加速器的配置项处理和应用。
以上分析了 QEMU 系统仿真在启动过程中,QEMU 系统仿真对目标虚拟机的加速器配置项所做的处理和应用。