最近在鲲鹏 aarch64 服务器上发现了一个 QEMU 的异常:启动虚拟机时加载 UEFI firmware 的时间很长,表现为需要几分钟才能出现虚拟机的 grub 界面。这个问题并不能稳定复现,经过进一步研究发现,问题出在 QEMU 的 vhost 部分。
修复的 patch 在这里。 这里涉及到 3 个 patch,主要内容如下:
- | diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
- | index 07179bb74f..45997cbf27 100644
- | --- a/hw/virtio/vhost.c
- | +++ b/hw/virtio/vhost.c
- | @@ -451,8 +451,13 @@ static void vhost_commit(MemoryListener *listener)
- | changed = true;
- | } else {
- | /* Same size, lets check the contents */
- | - changed = n_old_sections && memcmp(dev->mem_sections, old_sections,
- | - n_old_sections * sizeof(old_sections[0])) != 0;
- | + for (i = 0; i < n_old_sections; i++) {
- | + if (!MemoryRegionSection_eq(&old_sections[i],
- | + &dev->mem_sections[i])) {
- | + changed = true;
- | + break;
- | + }
- | + }
- | }
- |
- | trace_vhost_commit(dev->started, changed);
- | diff --git a/include/exec/memory.h b/include/exec/memory.h
- | index 32fb294308..a0eeff13e3 100644
- | --- a/include/exec/memory.h
- | +++ b/include/exec/memory.h
- | @@ -391,6 +391,17 @@ struct MemoryRegionSection {
- | bool readonly;
- | };
- |
- | +static inline bool MemoryRegionSection_eq(MemoryRegionSection *a,
- | + MemoryRegionSection *b)
- | +{
- | + return a->mr == b->mr &&
- | + a->fv == b->fv &&
- | + a->offset_within_region == b->offset_within_region &&
- | + a->offset_within_address_space == b->offset_within_address_space &&
- | + int128_eq(a->size, b->size) &&
- | + a->readonly == b->readonly;
- | +}
- | +
可以看到,核心就是不再使用 memcmp 整体比较两个结构体,而是手动逐个字段进行比较。这两种做法的差别是什么呢?答案是对齐填充:memcmp 会把结构体中的填充字节也一并比较,即使所有字段都相等,也可能被判定为"已变化"。
memcmp 比较的类型如下所示。这个类型的长度为 56 字节,其中各字段实际只占用 49 字节(readonly 位于偏移 48 处,占 1 字节),末尾的 7 字节是对齐填充。
- struct Int128 {
- uint64_t lo;
- int64_t hi;
- };
- struct MemoryRegionSection {
- struct Int128 size;
- void *mr;
- void *fv;
- uint64_t offset_within_region;
- uint64_t offset_within_address_space;
- bool readonly;
- };
学习地址: Dpdk/网络协议栈/vpp/OvS/DDos/NFV/虚拟化/高性能专家-学习视频教程-腾讯课堂
更多DPDK相关学习资料有需要的可以自行报名学习,免费订阅,久学习,或点击这里加qun免费
领取,关注我持续更新哦! !
POC 代码如下:
- struct MemoryRegionSection {
- struct Int128 size;
- void *mr;
- void *fv;
- uint64_t offset_within_region;
- uint64_t offset_within_address_space;
- bool readonly;
- };
-
- void hexdump(void *ptr, size_t size) {
- size_t i;
- unsigned char *p = (unsigned char*)ptr;
- for (i = 0; i < size; i++) {
- printf("%02x ", p[i]);
- }
- printf("\n");
- }
-
- int main() {
- struct MemoryRegionSection section1, section2;
- printf("sizeof(struct MemoryRegionSection) = %lu\n",
- sizeof(struct MemoryRegionSection));
- printf("offsetof(struct MemoryRegionSection, readonly) = %lu\n",
- offsetof(struct MemoryRegionSection, readonly));
- section2.size.hi = section1.size.hi = 0xabcdef;
- section2.size.lo = section1.size.lo = 0xabcdef;
- section2.mr = section1.mr = (void*)0xabcdef;
- section2.fv = section1.fv = (void*)0xabcdef;
- section2.offset_within_region = section1.offset_within_region = 0xabcdef;
- section2.offset_within_address_space = section1.offset_within_address_space = 0xabcdef;
- section2.readonly = section1.readonly = true;
- printf("memcmp(section1, section2, sizeof(section1)) = %d\n",
- memcmp(&section1, &section2, sizeof(section1)));
- printf("memcmp(section1, section2, 48) = %d\n",
- memcmp(&section1, &section2, 48));
- printf("hexdump(section1, sizeof(section1))\n");
- hexdump(&section1, sizeof(section1));
- printf("hexdump(section2, sizeof(section2))\n");
- hexdump(&section2, sizeof(section2));
- return 0;
- }
输出:
- sizeof(struct MemoryRegionSection) = 56
- offsetof(struct MemoryRegionSection, readonly) = 48
- memcmp(section1, section2, sizeof(section1)) = -64
- memcmp(section1, section2, 48) = 0
- hexdump(section1, sizeof(section1))
- ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 01 <0a> 73 23 ff 7e 00 00
- hexdump(section2, sizeof(section2))
- ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 ef cd ab 00 00 00 00 00 01 <4a> 72 23 ff 7e 00 00
很明显,在 64 位系统上结构体会自动按 8 字节对齐,所以最后 1 字节的 bool 后面会自动填充 7 个字节。这些填充字节是未初始化的内存,内容是不确定的(如上面输出中尖括号标出的字节所示,两个结构体在填充区域的内容并不相同),因此直接进行 memcmp 非常危险,结果很可能与预期不符。
QEMU 出现这样的低级错误,有些出乎意料。使用 memcmp 时需要小心:必须考虑结构体的内存布局,以及内存(包括填充字节)是否经过初始化,否则比较结果就可能与预期不一致。