[PATCH v4 21/21] KVM: selftests: Test READ=>WRITE dirty logging behavior for shadow MMU
Yosry Ahmed
yosry.ahmed at linux.dev
Fri Jan 2 09:36:38 PST 2026
On Tue, Dec 30, 2025 at 03:01:50PM -0800, Sean Christopherson wrote:
> Update the nested dirty log test to validate KVM's handling of READ faults
> when dirty logging is enabled. Specifically, set the Dirty bit in the
> guest PTEs used to map L2 GPAs, so that KVM will create writable SPTEs
> when handling L2 read faults. When handling read faults in the shadow MMU,
> KVM opportunistically creates a writable SPTE if the mapping can be
> writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e.
> if KVM doesn't need to intercept writes in order to emulate Dirty-bit
> updates.
>
> To actually test the L2 READ=>WRITE sequence, e.g. without masking a false
> pass by other test activity, route the READ=>WRITE and WRITE=>WRITE
> sequences to separate L1 pages, and differentiate between "marked dirty
> due to a WRITE access/fault" and "marked dirty due to creating a writable
> SPTE for a READ access/fault". The updated sequence exposes the bug fixed
> by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration
> with PML") when the guest performs a READ=>WRITE sequence.
>
> Signed-off-by: Sean Christopherson <seanjc at google.com>
> ---
> .../selftests/kvm/include/x86/processor.h | 1 +
> .../testing/selftests/kvm/lib/x86/processor.c | 7 ++
> .../selftests/kvm/x86/nested_dirty_log_test.c | 115 +++++++++++++-----
> 3 files changed, 90 insertions(+), 33 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
> index ab29b1c7ed2d..8945c9eea704 100644
> --- a/tools/testing/selftests/kvm/include/x86/processor.h
> +++ b/tools/testing/selftests/kvm/include/x86/processor.h
> @@ -1483,6 +1483,7 @@ bool kvm_cpu_has_tdp(void);
> void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size);
> void tdp_identity_map_default_memslots(struct kvm_vm *vm);
> void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size);
> +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa);
>
> /*
> * Basic CPU control in CR0
> diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
> index ab869a98bbdc..fab18e9be66c 100644
> --- a/tools/testing/selftests/kvm/lib/x86/processor.c
> +++ b/tools/testing/selftests/kvm/lib/x86/processor.c
> @@ -390,6 +390,13 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm,
> return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
> }
>
> +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa)
> +{
> + int level = PG_LEVEL_4K;
> +
> + return __vm_get_page_table_entry(vm, &vm->stage2_mmu, l2_gpa, &level);
> +}
> +
> uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr)
> {
> int level = PG_LEVEL_4K;
> diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
> index 89d2e86a0db9..1e7c1ed917e1 100644
> --- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
> +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
> @@ -17,29 +17,39 @@
>
> /* The memory slot index to track dirty pages */
> #define TEST_MEM_SLOT_INDEX 1
> -#define TEST_MEM_PAGES 3
> +#define TEST_MEM_PAGES 4
>
> /* L1 guest test virtual memory offset */
> -#define GUEST_TEST_MEM 0xc0000000
> +#define GUEST_TEST_MEM1 0xc0000000
> +#define GUEST_TEST_MEM2 0xc0002000
>
> /* L2 guest test virtual memory offset */
> #define NESTED_TEST_MEM1 0xc0001000
> -#define NESTED_TEST_MEM2 0xc0002000
> +#define NESTED_TEST_MEM2 0xc0003000
>
> #define L2_GUEST_STACK_SIZE 64
>
> +#define TEST_SYNC_PAGE_MASK 0xfull
> +#define TEST_SYNC_READ_FAULT BIT(4)
> +#define TEST_SYNC_WRITE_FAULT BIT(5)
> +#define TEST_SYNC_NO_FAULT BIT(6)
> +
> static void l2_guest_code(u64 *a, u64 *b)
> {
> READ_ONCE(*a);
> + GUEST_SYNC(0 | TEST_SYNC_READ_FAULT);
> WRITE_ONCE(*a, 1);
> - GUEST_SYNC(true);
> - GUEST_SYNC(false);
> + GUEST_SYNC(0 | TEST_SYNC_WRITE_FAULT);
> + READ_ONCE(*a);
> + GUEST_SYNC(0 | TEST_SYNC_NO_FAULT);
>
> WRITE_ONCE(*b, 1);
> - GUEST_SYNC(true);
> + GUEST_SYNC(2 | TEST_SYNC_WRITE_FAULT);
> WRITE_ONCE(*b, 1);
> - GUEST_SYNC(true);
> - GUEST_SYNC(false);
> + GUEST_SYNC(2 | TEST_SYNC_WRITE_FAULT);
> + READ_ONCE(*b);
> + GUEST_SYNC(2 | TEST_SYNC_NO_FAULT);
> + GUEST_SYNC(2 | TEST_SYNC_NO_FAULT);
Instead of hardcoding 0 and 2 here, which IIUC correspond both to the
L1 GPAs 0xc0000000 and 0xc0002000 and to the page indices in
host_test_mem, can we make the overall definitions a bit more
intuitive? For example:

#define GUEST_GPA_START		0xc0000000
#define GUEST_PAGE1_IDX		0
#define GUEST_PAGE2_IDX		1
#define GUEST_GPA_PAGE1		(GUEST_GPA_START + GUEST_PAGE1_IDX * PAGE_SIZE)
#define GUEST_GPA_PAGE2		(GUEST_GPA_START + GUEST_PAGE2_IDX * PAGE_SIZE)

/* Mapped to GUEST_GPA_PAGE1 and GUEST_GPA_PAGE2 */
#define GUEST_GVA_PAGE1		0xd0000000
#define GUEST_GVA_PAGE2		0xd0002000

/* Mapped to GUEST_GPA_PAGE1 and GUEST_GPA_PAGE2 using TDP in L1 */
#define GUEST_GVA_NESTED_PAGE1	0xd0001000
#define GUEST_GVA_NESTED_PAGE2	0xd0003000

Then in the L2 code, we can explicitly take in the GVAs of page1 and
page2 and use the definitions above in the GUEST_SYNC() calls, for
example:

static void l2_guest_code(u64 *page1_gva, u64 *page2_gva)
{
	READ_ONCE(*page1_gva);
	GUEST_SYNC(GUEST_PAGE1_IDX | TEST_SYNC_READ_FAULT);
	WRITE_ONCE(*page1_gva, 1);
	GUEST_SYNC(GUEST_PAGE1_IDX | TEST_SYNC_WRITE_FAULT);
	...
}

On the host side, we can then explicitly read page1 and page2 (instead
of indexing into host_test_mem).
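
Something along these lines (just a sketch, reusing the proposed
GUEST_GPA_PAGE1/2 defines from above):

	/* Resolve the two test pages once, instead of indexing host_test_mem */
	u64 *host_page1 = addr_gpa2hva(vm, GUEST_GPA_PAGE1);
	u64 *host_page2 = addr_gpa2hva(vm, GUEST_GPA_PAGE2);

	/* e.g. after a write fault on page1: */
	TEST_ASSERT(READ_ONCE(*host_page1) == 1, "Page 1 not written by guest");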

Alternatively, we can pass the guest GVA directly into GUEST_SYNC() and
use the lower bits for TEST_SYNC_READ_FAULT, TEST_SYNC_WRITE_FAULT, and
TEST_SYNC_NO_FAULT.
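
Rough sketch, assuming the test GVAs stay page-aligned (the existing
TEST_SYNC_*_FAULT bits are all below bit 12, so they fit in the low
bits of the address as-is):

	/* L2: encode the target GVA and the expected fault type together */
	GUEST_SYNC((u64)page1_gva | TEST_SYNC_READ_FAULT);

	/* Host: split uc.args[1] back into the GVA and the fault flags */
	u64 gva   = uc.args[1] & ~((u64)PAGE_SIZE - 1);
	u64 flags = uc.args[1] & ((u64)PAGE_SIZE - 1);
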
WDYT?
>
> /* Exit to L1 and never come back. */
> vmcall();
> @@ -53,7 +63,7 @@ static void l2_guest_code_tdp_enabled(void)
> static void l2_guest_code_tdp_disabled(void)
> {
> /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */
> - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM);
> + l2_guest_code((u64 *)GUEST_TEST_MEM1, (u64 *)GUEST_TEST_MEM2);
> }
>
> void l1_vmx_code(struct vmx_pages *vmx)
> @@ -72,9 +82,11 @@ void l1_vmx_code(struct vmx_pages *vmx)
>
> prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
>
> - GUEST_SYNC(false);
> + GUEST_SYNC(0 | TEST_SYNC_NO_FAULT);
> + GUEST_SYNC(2 | TEST_SYNC_NO_FAULT);
> GUEST_ASSERT(!vmlaunch());
> - GUEST_SYNC(false);
> + GUEST_SYNC(0 | TEST_SYNC_NO_FAULT);
> + GUEST_SYNC(2 | TEST_SYNC_NO_FAULT);
> GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
> GUEST_DONE();
> }
> @@ -91,9 +103,11 @@ static void l1_svm_code(struct svm_test_data *svm)
>
> generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
>
> - GUEST_SYNC(false);
> + GUEST_SYNC(0 | TEST_SYNC_NO_FAULT);
> + GUEST_SYNC(2 | TEST_SYNC_NO_FAULT);
> run_guest(svm->vmcb, svm->vmcb_gpa);
> - GUEST_SYNC(false);
> + GUEST_SYNC(0 | TEST_SYNC_NO_FAULT);
> + GUEST_SYNC(2 | TEST_SYNC_NO_FAULT);
> GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
> GUEST_DONE();
> }
> @@ -106,6 +120,11 @@ static void l1_guest_code(void *data)
> l1_svm_code(data);
> }
>
> +static uint64_t test_read_host_page(uint64_t *host_test_mem, int page_nr)
> +{
> + return host_test_mem[PAGE_SIZE * page_nr / sizeof(*host_test_mem)];
> +}
> +
> static void test_dirty_log(bool nested_tdp)
> {
> vm_vaddr_t nested_gva = 0;
> @@ -133,32 +152,45 @@ static void test_dirty_log(bool nested_tdp)
>
> /* Add an extra memory slot for testing dirty logging */
> vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
> - GUEST_TEST_MEM,
> + GUEST_TEST_MEM1,
> TEST_MEM_SLOT_INDEX,
> TEST_MEM_PAGES,
> KVM_MEM_LOG_DIRTY_PAGES);
>
> /*
> - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This
> + * Add an identity map for GVA range [0xc0000000, 0xc0004000). This
> * affects both L1 and L2. However...
> */
> - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES);
> + virt_map(vm, GUEST_TEST_MEM1, GUEST_TEST_MEM1, TEST_MEM_PAGES);
>
> /*
> - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
> - * 0xc0000000.
> + * ... pages in the L2 GPA ranges [0xc0001000, 0xc0002000) and
> + * [0xc0003000, 0xc0004000) will map to 0xc0000000 and 0xc0001000
> + * respectively.
> *
> * When TDP is disabled, the L2 guest code will still access the same L1
> * GPAs as the TDP enabled case.
> + *
> + * Set the Dirty bit in the PTEs used by L2 so that KVM will create
> + * writable SPTEs when handling read faults (if the Dirty bit isn't
> + * set, KVM must intercept the next write to emulate the Dirty bit
> + * update).
> */
> if (nested_tdp) {
> tdp_identity_map_default_memslots(vm);
> - tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
> - tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
> + tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM1, PAGE_SIZE);
> + tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM2, PAGE_SIZE);
> +
> +
> + *tdp_get_pte(vm, NESTED_TEST_MEM1) |= PTE_DIRTY_MASK(&vm->stage2_mmu);
> + *tdp_get_pte(vm, NESTED_TEST_MEM2) |= PTE_DIRTY_MASK(&vm->stage2_mmu);
> + } else {
> + *vm_get_pte(vm, GUEST_TEST_MEM1) |= PTE_DIRTY_MASK(&vm->mmu);
> + *vm_get_pte(vm, GUEST_TEST_MEM2) |= PTE_DIRTY_MASK(&vm->mmu);
> }
>
> bmap = bitmap_zalloc(TEST_MEM_PAGES);
> - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
> + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM1);
>
> while (!done) {
> memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE);
> @@ -169,25 +201,42 @@ static void test_dirty_log(bool nested_tdp)
> case UCALL_ABORT:
> REPORT_GUEST_ASSERT(uc);
> /* NOT REACHED */
> - case UCALL_SYNC:
> + case UCALL_SYNC: {
> + int page_nr = uc.args[1] & TEST_SYNC_PAGE_MASK;
> + int i;
> +
> /*
> * The nested guest wrote at offset 0x1000 in the memslot, but the
> * dirty bitmap must be filled in according to L1 GPA, not L2.
> */
> kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
> - if (uc.args[1]) {
> - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean");
> - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest");
> - } else {
> - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty");
> - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest");
> +
> + /*
> + * If a fault is expected, the page should be dirty
> + * as the Dirty bit is set in the gPTE. KVM should
> + * create a writable SPTE even on a read fault, *and*
> + * KVM must mark the GFN as dirty when doing so.
> + */
> + TEST_ASSERT(test_bit(page_nr, bmap) == !(uc.args[1] & TEST_SYNC_NO_FAULT),
> + "Page %u incorrectly reported %s on %s fault", page_nr,
> + test_bit(page_nr, bmap) ? "dirty" : "clean",
> + uc.args[1] & TEST_SYNC_NO_FAULT ? "no" :
> + uc.args[1] & TEST_SYNC_READ_FAULT ? "read" : "write");
> +
> + for (i = 0; i < TEST_MEM_PAGES; i++) {
> + if (i == page_nr && uc.args[1] & TEST_SYNC_WRITE_FAULT)
> + TEST_ASSERT(test_read_host_page(host_test_mem, i) == 1,
> + "Page %u not written by guest", i);
> + else
> + TEST_ASSERT(test_read_host_page(host_test_mem, i) == 0xaaaaaaaaaaaaaaaaULL,
> + "Page %u written by guest", i);
> +
> + if (i != page_nr)
> + TEST_ASSERT(!test_bit(i, bmap),
> + "Page %u incorrectly reported dirty", i);
> }
> -
> - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty");
> - TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest");
> - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty");
> - TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest");
> break;
> + }
> case UCALL_DONE:
> done = true;
> break;
> --
> 2.52.0.351.gbe84eed79e-goog
>