1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * KVM dirty page logging test
4 *
5 * Copyright (C) 2018, Red Hat, Inc.
6 */
7
8 #define _GNU_SOURCE /* for program_invocation_name */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <time.h>
14 #include <pthread.h>
15 #include <linux/bitmap.h>
16 #include <linux/bitops.h>
17
18 #include "test_util.h"
19 #include "kvm_util.h"
20 #include "processor.h"
21
/* ID of the vCPU this test creates and runs */
#define VCPU_ID				1

/* Index of the memory slot on which dirty pages are tracked */
#define TEST_MEM_SLOT_INDEX		1

/* Default guest virtual address of the test memory */
#define DEFAULT_GUEST_TEST_MEM		0xc0000000

/* Number of page writes the guest issues in each of its loops */
#define TEST_PAGES_PER_LOOP		1024

/* Default number of host loops (one KVM_GET_DIRTY_LOG per loop) */
#define TEST_HOST_LOOP_N		32UL

/* Default interval between two host loops, in milliseconds */
#define TEST_HOST_LOOP_INTERVAL		10UL
38
/*
 * Dirty bitmaps are always little endian.  On big endian (s390x) the
 * bit number has to be swizzled before it is handed to the native
 * bitops; everywhere else the native bitops are used directly.
 */
#if defined(__s390x__)
# define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
/* Translate a little endian bit number into a native one */
# define BITOP_LE_NR(nr)	((nr) ^ BITOP_LE_SWIZZLE)
# define test_bit_le(nr, addr)			test_bit(BITOP_LE_NR(nr), addr)
# define set_bit_le(nr, addr)			set_bit(BITOP_LE_NR(nr), addr)
# define clear_bit_le(nr, addr)			clear_bit(BITOP_LE_NR(nr), addr)
# define test_and_set_bit_le(nr, addr)		\
	test_and_set_bit(BITOP_LE_NR(nr), addr)
# define test_and_clear_bit_le(nr, addr)	\
	test_and_clear_bit(BITOP_LE_NR(nr), addr)
#else
# define test_bit_le			test_bit
# define set_bit_le			set_bit
# define clear_bit_le			clear_bit
# define test_and_set_bit_le		test_and_set_bit
# define test_and_clear_bit_le		test_and_clear_bit
#endif
59
60 /*
61 * Guest/Host shared variables. Ensure addr_gva2hva() and/or
62 * sync_global_to/from_guest() are used when accessing from
63 * the host. READ/WRITE_ONCE() should also be used with anything
64 * that may change.
65 */
66 static uint64_t host_page_size;
67 static uint64_t guest_page_size;
68 static uint64_t guest_num_pages;
69 static uint64_t random_array[TEST_PAGES_PER_LOOP];
70 static uint64_t iteration;
71
72 /*
73 * Guest physical memory offset of the testing memory slot.
74 * This will be set to the topmost valid physical address minus
75 * the test memory size.
76 */
77 static uint64_t guest_test_phys_mem;
78
79 /*
80 * Guest virtual memory offset of the testing memory slot.
81 * Must not conflict with identity mapped test code.
82 */
83 static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
84
85 /*
86 * Continuously write to the first 8 bytes of a random pages within
87 * the testing memory region.
88 */
guest_code(void)89 static void guest_code(void)
90 {
91 uint64_t addr;
92 int i;
93
94 /*
95 * On s390x, all pages of a 1M segment are initially marked as dirty
96 * when a page of the segment is written to for the very first time.
97 * To compensate this specialty in this test, we need to touch all
98 * pages during the first iteration.
99 */
100 for (i = 0; i < guest_num_pages; i++) {
101 addr = guest_test_virt_mem + i * guest_page_size;
102 *(uint64_t *)addr = READ_ONCE(iteration);
103 }
104
105 while (true) {
106 for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
107 addr = guest_test_virt_mem;
108 addr += (READ_ONCE(random_array[i]) % guest_num_pages)
109 * guest_page_size;
110 addr &= ~(host_page_size - 1);
111 *(uint64_t *)addr = READ_ONCE(iteration);
112 }
113
114 /* Tell the host that we need more random numbers */
115 GUEST_SYNC(1);
116 }
117 }
118
/* Host-only state */

/* Polled by the vcpu worker loop; set when the worker should stop */
static bool host_quit;

/* Host address of the test VM memory region on which dirty logs are tracked */
static void *host_test_mem;
static uint64_t host_num_pages;

/* Counters used for the final statistics only */
static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;

enum log_mode_t {
	/* Logging relies on KVM_GET_DIRTY_LOG only */
	LOG_MODE_DIRTY_LOG,

	/* Logging relies on both KVM_[GET|CLEAR]_DIRTY_LOG */
	LOG_MODE_CLEAR_LOG,

	/* Number of real logging modes */
	LOG_MODE_NUM,

	/* Pseudo mode: run every supported mode */
	LOG_MODE_ALL = LOG_MODE_NUM,
};

/* Logging mode requested by the user; all supported modes by default */
static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
/* Logging mode used by the current run */
static enum log_mode_t host_log_mode;
148
clear_log_supported(void)149 static bool clear_log_supported(void)
150 {
151 return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
152 }
153
clear_log_create_vm_done(struct kvm_vm * vm)154 static void clear_log_create_vm_done(struct kvm_vm *vm)
155 {
156 struct kvm_enable_cap cap = {};
157 u64 manual_caps;
158
159 manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
160 TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
161 manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
162 KVM_DIRTY_LOG_INITIALLY_SET);
163 cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
164 cap.args[0] = manual_caps;
165 vm_enable_cap(vm, &cap);
166 }
167
/*
 * "dirty-log" collection hook: fetch the dirty log of @slot into
 * @bitmap.  @num_pages is part of the hook signature but is not needed
 * by this mode.
 */
static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
					  void *bitmap, uint32_t num_pages)
{
	(void)num_pages;

	kvm_vm_get_dirty_log(vm, slot, bitmap);
}
173
/*
 * "clear-log" collection hook: fetch the dirty log of @slot into
 * @bitmap, then hand the same bitmap to kvm_vm_clear_dirty_log() with
 * 0 and @num_pages as the range arguments.
 */
static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
					  void *bitmap, uint32_t num_pages)
{
	const uint64_t first = 0;

	kvm_vm_get_dirty_log(vm, slot, bitmap);
	kvm_vm_clear_dirty_log(vm, slot, bitmap, first, num_pages);
}
180
181 struct log_mode {
182 const char *name;
183 /* Return true if this mode is supported, otherwise false */
184 bool (*supported)(void);
185 /* Hook when the vm creation is done (before vcpu creation) */
186 void (*create_vm_done)(struct kvm_vm *vm);
187 /* Hook to collect the dirty pages into the bitmap provided */
188 void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
189 void *bitmap, uint32_t num_pages);
190 } log_modes[LOG_MODE_NUM] = {
191 {
192 .name = "dirty-log",
193 .collect_dirty_pages = dirty_log_collect_dirty_pages,
194 },
195 {
196 .name = "clear-log",
197 .supported = clear_log_supported,
198 .create_vm_done = clear_log_create_vm_done,
199 .collect_dirty_pages = clear_log_collect_dirty_pages,
200 },
201 };
202
/*
 * Bitmap of pages whose dirty bit must be set in the _next_ iteration.
 * For example, when a page already holds the value of the current
 * iteration while its bit is clear in the latest bitmap, the system
 * must report that write in the next get dirty log call.
 */
static unsigned long *host_bmap_track;
211
log_modes_dump(void)212 static void log_modes_dump(void)
213 {
214 int i;
215
216 printf("all");
217 for (i = 0; i < LOG_MODE_NUM; i++)
218 printf(", %s", log_modes[i].name);
219 printf("\n");
220 }
221
log_mode_supported(void)222 static bool log_mode_supported(void)
223 {
224 struct log_mode *mode = &log_modes[host_log_mode];
225
226 if (mode->supported)
227 return mode->supported();
228
229 return true;
230 }
231
log_mode_create_vm_done(struct kvm_vm * vm)232 static void log_mode_create_vm_done(struct kvm_vm *vm)
233 {
234 struct log_mode *mode = &log_modes[host_log_mode];
235
236 if (mode->create_vm_done)
237 mode->create_vm_done(vm);
238 }
239
log_mode_collect_dirty_pages(struct kvm_vm * vm,int slot,void * bitmap,uint32_t num_pages)240 static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
241 void *bitmap, uint32_t num_pages)
242 {
243 struct log_mode *mode = &log_modes[host_log_mode];
244
245 TEST_ASSERT(mode->collect_dirty_pages != NULL,
246 "collect_dirty_pages() is required for any log mode!");
247 mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
248 }
249
/* Fill the first @size entries of @guest_array with values from random() */
static void generate_random_array(uint64_t *guest_array, uint64_t size)
{
	uint64_t *slot = guest_array;
	uint64_t *const end = guest_array + size;

	while (slot < end)
		*slot++ = random();
}
257
vcpu_worker(void * data)258 static void *vcpu_worker(void *data)
259 {
260 int ret;
261 struct kvm_vm *vm = data;
262 uint64_t *guest_array;
263 uint64_t pages_count = 0;
264 struct kvm_run *run;
265
266 run = vcpu_state(vm, VCPU_ID);
267
268 guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
269 generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
270
271 while (!READ_ONCE(host_quit)) {
272 /* Let the guest dirty the random pages */
273 ret = _vcpu_run(vm, VCPU_ID);
274 TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
275 if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
276 pages_count += TEST_PAGES_PER_LOOP;
277 generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
278 } else {
279 TEST_FAIL("Invalid guest sync status: "
280 "exit_reason=%s\n",
281 exit_reason_str(run->exit_reason));
282 }
283 }
284
285 pr_info("Dirtied %"PRIu64" pages\n", pages_count);
286
287 return NULL;
288 }
289
vm_dirty_log_verify(enum vm_guest_mode mode,unsigned long * bmap)290 static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
291 {
292 uint64_t step = vm_num_host_pages(mode, 1);
293 uint64_t page;
294 uint64_t *value_ptr;
295
296 for (page = 0; page < host_num_pages; page += step) {
297 value_ptr = host_test_mem + page * host_page_size;
298
299 /* If this is a special page that we were tracking... */
300 if (test_and_clear_bit_le(page, host_bmap_track)) {
301 host_track_next_count++;
302 TEST_ASSERT(test_bit_le(page, bmap),
303 "Page %"PRIu64" should have its dirty bit "
304 "set in this iteration but it is missing",
305 page);
306 }
307
308 if (test_and_clear_bit_le(page, bmap)) {
309 host_dirty_count++;
310 /*
311 * If the bit is set, the value written onto
312 * the corresponding page should be either the
313 * previous iteration number or the current one.
314 */
315 TEST_ASSERT(*value_ptr == iteration ||
316 *value_ptr == iteration - 1,
317 "Set page %"PRIu64" value %"PRIu64
318 " incorrect (iteration=%"PRIu64")",
319 page, *value_ptr, iteration);
320 } else {
321 host_clear_count++;
322 /*
323 * If cleared, the value written can be any
324 * value smaller or equals to the iteration
325 * number. Note that the value can be exactly
326 * (iteration-1) if that write can happen
327 * like this:
328 *
329 * (1) increase loop count to "iteration-1"
330 * (2) write to page P happens (with value
331 * "iteration-1")
332 * (3) get dirty log for "iteration-1"; we'll
333 * see that page P bit is set (dirtied),
334 * and not set the bit in host_bmap_track
335 * (4) increase loop count to "iteration"
336 * (which is current iteration)
337 * (5) get dirty log for current iteration,
338 * we'll see that page P is cleared, with
339 * value "iteration-1".
340 */
341 TEST_ASSERT(*value_ptr <= iteration,
342 "Clear page %"PRIu64" value %"PRIu64
343 " incorrect (iteration=%"PRIu64")",
344 page, *value_ptr, iteration);
345 if (*value_ptr == iteration) {
346 /*
347 * This page is _just_ modified; it
348 * should report its dirtyness in the
349 * next run
350 */
351 set_bit_le(page, host_bmap_track);
352 }
353 }
354 }
355 }
356
create_vm(enum vm_guest_mode mode,uint32_t vcpuid,uint64_t extra_mem_pages,void * guest_code)357 static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
358 uint64_t extra_mem_pages, void *guest_code)
359 {
360 struct kvm_vm *vm;
361 uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
362
363 pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
364
365 vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
366 kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
367 #ifdef __x86_64__
368 vm_create_irqchip(vm);
369 #endif
370 log_mode_create_vm_done(vm);
371 vm_vcpu_add_default(vm, vcpuid, guest_code);
372 return vm;
373 }
374
375 #define DIRTY_MEM_BITS 30 /* 1G */
376 #define PAGE_SHIFT_4K 12
377
run_test(enum vm_guest_mode mode,unsigned long iterations,unsigned long interval,uint64_t phys_offset)378 static void run_test(enum vm_guest_mode mode, unsigned long iterations,
379 unsigned long interval, uint64_t phys_offset)
380 {
381 pthread_t vcpu_thread;
382 struct kvm_vm *vm;
383 unsigned long *bmap;
384
385 if (!log_mode_supported()) {
386 print_skip("Log mode '%s' not supported",
387 log_modes[host_log_mode].name);
388 return;
389 }
390
391 /*
392 * We reserve page table for 2 times of extra dirty mem which
393 * will definitely cover the original (1G+) test range. Here
394 * we do the calculation with 4K page size which is the
395 * smallest so the page number will be enough for all archs
396 * (e.g., 64K page size guest will need even less memory for
397 * page tables).
398 */
399 vm = create_vm(mode, VCPU_ID,
400 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
401 guest_code);
402
403 guest_page_size = vm_get_page_size(vm);
404 /*
405 * A little more than 1G of guest page sized pages. Cover the
406 * case where the size is not aligned to 64 pages.
407 */
408 guest_num_pages = (1ul << (DIRTY_MEM_BITS -
409 vm_get_page_shift(vm))) + 3;
410 guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
411
412 host_page_size = getpagesize();
413 host_num_pages = vm_num_host_pages(mode, guest_num_pages);
414
415 if (!phys_offset) {
416 guest_test_phys_mem = (vm_get_max_gfn(vm) -
417 guest_num_pages) * guest_page_size;
418 guest_test_phys_mem &= ~(host_page_size - 1);
419 } else {
420 guest_test_phys_mem = phys_offset;
421 }
422
423 #ifdef __s390x__
424 /* Align to 1M (segment size) */
425 guest_test_phys_mem &= ~((1 << 20) - 1);
426 #endif
427
428 pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
429
430 bmap = bitmap_alloc(host_num_pages);
431 host_bmap_track = bitmap_alloc(host_num_pages);
432
433 /* Add an extra memory slot for testing dirty logging */
434 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
435 guest_test_phys_mem,
436 TEST_MEM_SLOT_INDEX,
437 guest_num_pages,
438 KVM_MEM_LOG_DIRTY_PAGES);
439
440 /* Do mapping for the dirty track memory slot */
441 virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
442
443 /* Cache the HVA pointer of the region */
444 host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
445
446 #ifdef __x86_64__
447 vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
448 #endif
449 ucall_init(vm, NULL);
450
451 /* Export the shared variables to the guest */
452 sync_global_to_guest(vm, host_page_size);
453 sync_global_to_guest(vm, guest_page_size);
454 sync_global_to_guest(vm, guest_test_virt_mem);
455 sync_global_to_guest(vm, guest_num_pages);
456
457 /* Start the iterations */
458 iteration = 1;
459 sync_global_to_guest(vm, iteration);
460 host_quit = false;
461 host_dirty_count = 0;
462 host_clear_count = 0;
463 host_track_next_count = 0;
464
465 pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
466
467 while (iteration < iterations) {
468 /* Give the vcpu thread some time to dirty some pages */
469 usleep(interval * 1000);
470 log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
471 bmap, host_num_pages);
472 vm_dirty_log_verify(mode, bmap);
473 iteration++;
474 sync_global_to_guest(vm, iteration);
475 }
476
477 /* Tell the vcpu thread to quit */
478 host_quit = true;
479 pthread_join(vcpu_thread, NULL);
480
481 pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
482 "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
483 host_track_next_count);
484
485 free(bmap);
486 free(host_bmap_track);
487 ucall_uninit(vm);
488 kvm_vm_free(vm);
489 }
490
491 struct guest_mode {
492 bool supported;
493 bool enabled;
494 };
495 static struct guest_mode guest_modes[NUM_VM_MODES];
496
497 #define guest_mode_init(mode, supported, enabled) ({ \
498 guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
499 })
500
help(char * name)501 static void help(char *name)
502 {
503 int i;
504
505 puts("");
506 printf("usage: %s [-h] [-i iterations] [-I interval] "
507 "[-p offset] [-m mode]\n", name);
508 puts("");
509 printf(" -i: specify iteration counts (default: %"PRIu64")\n",
510 TEST_HOST_LOOP_N);
511 printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
512 TEST_HOST_LOOP_INTERVAL);
513 printf(" -p: specify guest physical test memory offset\n"
514 " Warning: a low offset can conflict with the loaded test code.\n");
515 printf(" -M: specify the host logging mode "
516 "(default: run all log modes). Supported modes: \n\t");
517 log_modes_dump();
518 printf(" -m: specify the guest mode ID to test "
519 "(default: test all supported modes)\n"
520 " This option may be used multiple times.\n"
521 " Guest mode IDs:\n");
522 for (i = 0; i < NUM_VM_MODES; ++i) {
523 printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
524 guest_modes[i].supported ? " (supported)" : "");
525 }
526 puts("");
527 exit(0);
528 }
529
main(int argc,char * argv[])530 int main(int argc, char *argv[])
531 {
532 unsigned long iterations = TEST_HOST_LOOP_N;
533 unsigned long interval = TEST_HOST_LOOP_INTERVAL;
534 bool mode_selected = false;
535 uint64_t phys_offset = 0;
536 unsigned int mode;
537 int opt, i, j;
538
539 #ifdef __x86_64__
540 guest_mode_init(VM_MODE_PXXV48_4K, true, true);
541 #endif
542 #ifdef __aarch64__
543 guest_mode_init(VM_MODE_P40V48_4K, true, true);
544 guest_mode_init(VM_MODE_P40V48_64K, true, true);
545
546 {
547 unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
548
549 if (limit >= 52)
550 guest_mode_init(VM_MODE_P52V48_64K, true, true);
551 if (limit >= 48) {
552 guest_mode_init(VM_MODE_P48V48_4K, true, true);
553 guest_mode_init(VM_MODE_P48V48_64K, true, true);
554 }
555 }
556 #endif
557 #ifdef __s390x__
558 guest_mode_init(VM_MODE_P40V48_4K, true, true);
559 #endif
560
561 while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) {
562 switch (opt) {
563 case 'i':
564 iterations = strtol(optarg, NULL, 10);
565 break;
566 case 'I':
567 interval = strtol(optarg, NULL, 10);
568 break;
569 case 'p':
570 phys_offset = strtoull(optarg, NULL, 0);
571 break;
572 case 'm':
573 if (!mode_selected) {
574 for (i = 0; i < NUM_VM_MODES; ++i)
575 guest_modes[i].enabled = false;
576 mode_selected = true;
577 }
578 mode = strtoul(optarg, NULL, 10);
579 TEST_ASSERT(mode < NUM_VM_MODES,
580 "Guest mode ID %d too big", mode);
581 guest_modes[mode].enabled = true;
582 break;
583 case 'M':
584 if (!strcmp(optarg, "all")) {
585 host_log_mode_option = LOG_MODE_ALL;
586 break;
587 }
588 for (i = 0; i < LOG_MODE_NUM; i++) {
589 if (!strcmp(optarg, log_modes[i].name)) {
590 pr_info("Setting log mode to: '%s'\n",
591 optarg);
592 host_log_mode_option = i;
593 break;
594 }
595 }
596 if (i == LOG_MODE_NUM) {
597 printf("Log mode '%s' invalid. Please choose "
598 "from: ", optarg);
599 log_modes_dump();
600 exit(1);
601 }
602 break;
603 case 'h':
604 default:
605 help(argv[0]);
606 break;
607 }
608 }
609
610 TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
611 TEST_ASSERT(interval > 0, "Interval must be greater than zero");
612
613 pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
614 iterations, interval);
615
616 srandom(time(0));
617
618 for (i = 0; i < NUM_VM_MODES; ++i) {
619 if (!guest_modes[i].enabled)
620 continue;
621 TEST_ASSERT(guest_modes[i].supported,
622 "Guest mode ID %d (%s) not supported.",
623 i, vm_guest_mode_string(i));
624 if (host_log_mode_option == LOG_MODE_ALL) {
625 /* Run each log mode */
626 for (j = 0; j < LOG_MODE_NUM; j++) {
627 pr_info("Testing Log Mode '%s'\n",
628 log_modes[j].name);
629 host_log_mode = j;
630 run_test(i, iterations, interval, phys_offset);
631 }
632 } else {
633 host_log_mode = host_log_mode_option;
634 run_test(i, iterations, interval, phys_offset);
635 }
636 }
637
638 return 0;
639 }
640