1 /*
2 * Stress userfaultfd syscall.
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 * This test allocates two virtual areas and bounces the physical
10 * memory across the two virtual areas (from area_src to area_dst)
11 * using userfaultfd.
12 *
13 * There are three threads running per CPU:
14 *
15 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
16 * page of the area_dst (while the physical page may still be in
17 * area_src), and increments a per-page counter in the same page,
18 * and checks its value against a verification region.
19 *
20 * 2) another per-CPU thread handles the userfaults generated by
21 * thread 1 above. userfaultfd blocking reads or poll() modes are
22 * exercised interleaved.
23 *
24 * 3) one last per-CPU thread transfers the memory in the background
25 * at maximum bandwidth (if not already transferred by thread
26 * 2). Each cpu thread takes cares of transferring a portion of the
27 * area.
28 *
29 * When all threads of type 3 completed the transfer, one bounce is
30 * complete. area_src and area_dst are then swapped. All threads are
31 * respawned and so the bounce is immediately restarted in the
32 * opposite direction.
33 *
 * The per-CPU threads of type 1, by triggering userfaults inside
 * pthread_mutex_lock, also verify the atomicity of the memory
 * transfer (UFFDIO_COPY).
37 *
38 * The program takes two parameters: the amounts of physical memory in
39 * megabytes (MiB) of the area and the number of bounces to execute.
40 *
41 * # 100MiB 99999 bounces
42 * ./userfaultfd 100 99999
43 *
44 * # 1GiB 99 bounces
45 * ./userfaultfd 1000 99
46 *
47 * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
48 * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
49 */
50
51 #define _GNU_SOURCE
52 #include <stdio.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <stdlib.h>
56 #include <sys/types.h>
57 #include <sys/stat.h>
58 #include <fcntl.h>
59 #include <time.h>
60 #include <signal.h>
61 #include <poll.h>
62 #include <string.h>
63 #include <sys/mman.h>
64 #include <sys/syscall.h>
65 #include <sys/ioctl.h>
66 #include <sys/wait.h>
67 #include <pthread.h>
68 #include <linux/userfaultfd.h>
69
70 #ifdef __NR_userfaultfd
71
/* test geometry, computed in main() from the command line and sysconf() */
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

/* low bits of the remaining bounce count select the mode of each pass */
#define BOUNCE_RANDOM (1<<0)
#define BOUNCE_RACINGFAULTS (1<<1)
#define BOUNCE_VERIFY (1<<2)
#define BOUNCE_POLL (1<<3)
static int bounces;

#ifdef HUGETLB_TEST
static int huge_fd;
/* sentinel: the mapping backed by file offset 0 (i.e. area_src) */
static char *huge_fd_off0;
#endif
/* expected value of every per-page counter */
static unsigned long long *count_verify;
static int uffd, uffd_flags, finished, *pipefd;
static char *area_src, *area_dst;
/* all-zero reference page for BOUNCE_VERIFY comparisons */
static char *zeropage;
pthread_attr_t attr;

/* pthread_mutex_t starts at page offset 0 */
#define area_mutex(___area, ___nr) \
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 */
#define area_count(___area, ___nr) \
	((volatile unsigned long long *) ((unsigned long) \
				 ((___area) + (___nr)*page_size + \
				  sizeof(pthread_mutex_t) + \
				  sizeof(unsigned long long) - 1) & \
				 ~(unsigned long)(sizeof(unsigned long long) \
						  - 1)))
104
105 #if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)
106
107 /* Anonymous memory */
108 #define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
109 (1 << _UFFDIO_COPY) | \
110 (1 << _UFFDIO_ZEROPAGE))
111
release_pages(char * rel_area)112 static int release_pages(char *rel_area)
113 {
114 int ret = 0;
115
116 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
117 perror("madvise");
118 ret = 1;
119 }
120
121 return ret;
122 }
123
allocate_area(void ** alloc_area)124 static void allocate_area(void **alloc_area)
125 {
126 if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
127 fprintf(stderr, "out of memory\n");
128 *alloc_area = NULL;
129 }
130 }
131
132 #else /* HUGETLB_TEST or SHMEM_TEST */
133
134 #define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC
135
136 #ifdef HUGETLB_TEST
137
138 /* HugeTLB memory */
release_pages(char * rel_area)139 static int release_pages(char *rel_area)
140 {
141 int ret = 0;
142
143 if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
144 rel_area == huge_fd_off0 ? 0 :
145 nr_pages * page_size,
146 nr_pages * page_size)) {
147 perror("fallocate");
148 ret = 1;
149 }
150
151 return ret;
152 }
153
154
allocate_area(void ** alloc_area)155 static void allocate_area(void **alloc_area)
156 {
157 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
158 MAP_PRIVATE | MAP_HUGETLB, huge_fd,
159 *alloc_area == area_src ? 0 :
160 nr_pages * page_size);
161 if (*alloc_area == MAP_FAILED) {
162 fprintf(stderr, "mmap of hugetlbfs file failed\n");
163 *alloc_area = NULL;
164 }
165
166 if (*alloc_area == area_src)
167 huge_fd_off0 = *alloc_area;
168 }
169
170 #elif defined(SHMEM_TEST)
171
172 /* Shared memory */
release_pages(char * rel_area)173 static int release_pages(char *rel_area)
174 {
175 int ret = 0;
176
177 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
178 perror("madvise");
179 ret = 1;
180 }
181
182 return ret;
183 }
184
allocate_area(void ** alloc_area)185 static void allocate_area(void **alloc_area)
186 {
187 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
188 MAP_ANONYMOUS | MAP_SHARED, -1, 0);
189 if (*alloc_area == MAP_FAILED) {
190 fprintf(stderr, "shared memory mmap failed\n");
191 *alloc_area = NULL;
192 }
193 }
194
195 #else /* SHMEM_TEST */
196 #error "Undefined test type"
197 #endif /* HUGETLB_TEST */
198
199 #endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
200
/*
 * Byte-by-byte comparison that never reports equality spuriously
 * while the memory under it is changing concurrently (bcmp/memcmp
 * can, see the comment in locking_thread).  Returns 0 iff all n
 * bytes are equal, 1 otherwise.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	size_t pos;

	for (pos = 0; pos < n; pos++) {
		if (str1[pos] != str2[pos])
			return 1;
	}

	return 0;
}
209
/*
 * Per-CPU pthread of type 1: repeatedly picks a page of area_dst
 * (randomly when BOUNCE_RANDOM, sequentially otherwise), takes the
 * per-page mutex — which may trigger a userfault — and increments the
 * per-page counter, checking it against count_verify[].
 */
static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct random_data rand;
	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
	int32_t rand_nr;
	unsigned long long count;
	char randstate[64];
	unsigned int seed;
	time_t start;

	if (bounces & BOUNCE_RANDOM) {
		/* vary the seed per bounce; per-cpu too unless racing faults */
		seed = (unsigned int) time(NULL) - bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			seed += cpu;
		bzero(&rand, sizeof(rand));
		bzero(&randstate, sizeof(randstate));
		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
			fprintf(stderr, "srandom_r error\n"), exit(1);
	} else {
		/* sequential mode: start offset varies with the bounce count */
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (random_r(&rand, &rand_nr))
				fprintf(stderr, "random_r 1 error\n"), exit(1);
			page_nr = rand_nr;
			if (sizeof(page_nr) > sizeof(rand_nr)) {
				/* draw a second value to fill the high bits */
				if (random_r(&rand, &rand_nr))
					fprintf(stderr, "random_r 2 error\n"), exit(1);
				page_nr |= (((unsigned long) rand_nr) << 16) <<
					   16;
			}
		} else
			page_nr += 1;
		page_nr %= nr_pages;

		start = time(NULL);
		if (bounces & BOUNCE_VERIFY) {
			count = *area_count(area_dst, page_nr);
			if (!count)
				fprintf(stderr,
					"page_nr %lu wrong count %Lu %Lu\n",
					page_nr, count,
					count_verify[page_nr]), exit(1);


			/*
			 * We can't use bcmp (or memcmp) because that
			 * returns 0 erroneously if the memory is
			 * changing under it (even if the end of the
			 * page is never changing and always
			 * different).
			 */
#if 1
			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
				     page_size))
				fprintf(stderr,
					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
					page_nr, count,
					count_verify[page_nr]), exit(1);
#else
			unsigned long loops;

			loops = 0;
			/* uncomment the below line to test with mutex */
			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
			while (!bcmp(area_dst + page_nr * page_size, zeropage,
				     page_size)) {
				loops += 1;
				if (loops > 10)
					break;
			}
			/* uncomment below line to test with mutex */
			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
			if (loops) {
				fprintf(stderr,
					"page_nr %lu all zero thread %lu %p %lu\n",
					page_nr, cpu, area_dst + page_nr * page_size,
					loops);
				if (loops > 10)
					exit(1);
			}
#endif
		}

		/* the lock may userfault; the copy must be atomic (UFFDIO_COPY) */
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr]) {
			fprintf(stderr,
				"page_nr %lu memory corruption %Lu %Lu\n",
				page_nr, count,
				count_verify[page_nr]), exit(1);
		}
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));

		if (time(NULL) - start > 1)
			fprintf(stderr,
				"userfault too slow %ld "
				"possible false positive with overcommit\n",
				time(NULL) - start);
	}

	return NULL;
}
320
copy_page(int ufd,unsigned long offset)321 static int copy_page(int ufd, unsigned long offset)
322 {
323 struct uffdio_copy uffdio_copy;
324
325 if (offset >= nr_pages * page_size)
326 fprintf(stderr, "unexpected offset %lu\n",
327 offset), exit(1);
328 uffdio_copy.dst = (unsigned long) area_dst + offset;
329 uffdio_copy.src = (unsigned long) area_src + offset;
330 uffdio_copy.len = page_size;
331 uffdio_copy.mode = 0;
332 uffdio_copy.copy = 0;
333 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
334 /* real retval in ufdio_copy.copy */
335 if (uffdio_copy.copy != -EEXIST)
336 fprintf(stderr, "UFFDIO_COPY error %Ld\n",
337 uffdio_copy.copy), exit(1);
338 } else if (uffdio_copy.copy != page_size) {
339 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
340 uffdio_copy.copy), exit(1);
341 } else
342 return 1;
343 return 0;
344 }
345
/*
 * Per-CPU pthread of type 2 in poll() mode: services userfaults and
 * the non-cooperative events (fork, remap, remove) until a byte
 * arrives on its pipe.  Returns the number of userfaults it resolved,
 * cast to void *.
 */
static void *uffd_poll_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	unsigned long offset;
	char tmp_chr;
	unsigned long userfaults = 0;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	/* read end of this cpu's pipe: a write there asks us to quit */
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (!ret)
			fprintf(stderr, "poll error %d\n", ret), exit(1);
		if (ret < 0)
			perror("poll"), exit(1);
		if (pollfd[1].revents & POLLIN) {
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				fprintf(stderr, "read pipefd error\n"),
					exit(1);
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			fprintf(stderr, "pollfd[0].revents %d\n",
				pollfd[0].revents), exit(1);
		ret = read(uffd, &msg, sizeof(msg));
		if (ret < 0) {
			/* the uffd is O_NONBLOCK in poll mode */
			if (errno == EAGAIN)
				continue;
			perror("nonblocking read error"), exit(1);
		}
		switch (msg.event) {
		default:
			fprintf(stderr, "unexpected msg event %u\n",
				msg.event), exit(1);
			break;
		case UFFD_EVENT_PAGEFAULT:
			if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
				fprintf(stderr, "unexpected write fault\n"), exit(1);
			/* page-align the faulting address within area_dst */
			offset = (char *)(unsigned long)msg.arg.pagefault.address -
				 area_dst;
			offset &= ~(page_size-1);
			if (copy_page(uffd, offset))
				userfaults++;
			break;
		case UFFD_EVENT_FORK:
			/* switch to the child's uffd delivered with the event */
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			/* stop monitoring the removed range */
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				fprintf(stderr, "remove failure\n"), exit(1);
			break;
		case UFFD_EVENT_REMAP:
			/* track the area after the mremap in faulting_process */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}
	return (void *)userfaults;
}
415
/* held while an uffd_read_thread starts up; released once it is running */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Per-CPU pthread of type 2 in blocking-read mode: resolves
 * userfaults with UFFDIO_COPY until cancelled via pthread_cancel()
 * (read() is a cancellation point).  The resolved-fault count is
 * stored through arg.
 */
static void *uffd_read_thread(void *arg)
{
	unsigned long *this_cpu_userfaults;
	struct uffd_msg msg;
	unsigned long offset;
	int ret;

	this_cpu_userfaults = (unsigned long *) arg;
	*this_cpu_userfaults = 0;

	/* tell the spawner we're up and serving faults */
	pthread_mutex_unlock(&uffd_read_mutex);
	/* from here cancellation is ok */

	for (;;) {
		ret = read(uffd, &msg, sizeof(msg));
		if (ret != sizeof(msg)) {
			if (ret < 0)
				perror("blocking read error"), exit(1);
			else
				fprintf(stderr, "short read\n"), exit(1);
		}
		if (msg.event != UFFD_EVENT_PAGEFAULT)
			fprintf(stderr, "unexpected msg event %u\n",
				msg.event), exit(1);
		if (bounces & BOUNCE_VERIFY &&
		    msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			fprintf(stderr, "unexpected write fault\n"), exit(1);
		/* page-align the faulting address within area_dst */
		offset = (char *)(unsigned long)msg.arg.pagefault.address -
			 area_dst;
		offset &= ~(page_size-1);
		if (copy_page(uffd, offset))
			(*this_cpu_userfaults)++;
	}
	return (void *)NULL;
}
453
background_thread(void * arg)454 static void *background_thread(void *arg)
455 {
456 unsigned long cpu = (unsigned long) arg;
457 unsigned long page_nr;
458
459 for (page_nr = cpu * nr_pages_per_cpu;
460 page_nr < (cpu+1) * nr_pages_per_cpu;
461 page_nr++)
462 copy_page(uffd, page_nr * page_size);
463
464 return NULL;
465 }
466
/*
 * Run one bounce: spawn the three thread types per cpu, wait for the
 * background transfer to complete, zap area_src, stop the uffd
 * threads collecting their userfault counts into userfaults[], then
 * stop the locking threads.  Returns 0 on success, 1 on any failure.
 */
static int stress(unsigned long *userfaults)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];
	void **_userfaults = (void **) userfaults;

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread, (void *)cpu))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   &_userfaults[cpu]))
				return 1;
			/* blocks until the reader unlocks, i.e. it is running */
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src, the whole area has
	 * been transferred already by the background threads. The
	 * area_src could then be faulted in in a racy way by still
	 * running uffdio_threads reading zeropages after we zapped
	 * area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
	if (release_pages(area_src))
		return 1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			/* poll threads quit on a pipe write, returning their count */
			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
				fprintf(stderr, "pipefd write error\n");
				return 1;
			}
			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
				return 1;
		} else {
			/* read threads block in read(); cancel them instead */
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	return 0;
}
535
userfaultfd_open(int features)536 static int userfaultfd_open(int features)
537 {
538 struct uffdio_api uffdio_api;
539
540 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
541 if (uffd < 0) {
542 fprintf(stderr,
543 "userfaultfd syscall not available in this kernel\n");
544 return 1;
545 }
546 uffd_flags = fcntl(uffd, F_GETFD, NULL);
547
548 uffdio_api.api = UFFD_API;
549 uffdio_api.features = features;
550 if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
551 fprintf(stderr, "UFFDIO_API\n");
552 return 1;
553 }
554 if (uffdio_api.api != UFFD_API) {
555 fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
556 return 1;
557 }
558
559 return 0;
560 }
561
562 /*
563 * For non-cooperative userfaultfd test we fork() a process that will
564 * generate pagefaults, will mremap the area monitored by the
565 * userfaultfd and at last this process will release the monitored
566 * area.
567 * For the anonymous and shared memory the area is divided into two
568 * parts, the first part is accessed before mremap, and the second
569 * part is accessed after mremap. Since hugetlbfs does not support
570 * mremap, the entire monitored area is accessed in a single pass for
571 * HUGETLB_TEST.
572 * The release of the pages currently generates event for shmem and
573 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
574 * for hugetlb.
575 */
/*
 * Body of the fork()ed child in the events test: faults in the
 * monitored area (half before and half after an mremap, except on
 * hugetlbfs which can't mremap), then releases it, exercising the
 * FORK/REMAP/REMOVE events handled by uffd_poll_thread.
 */
static int faulting_process(void)
{
	unsigned long nr;
	unsigned long long count;

#ifndef HUGETLB_TEST
	/* first half is faulted before the mremap, second half after */
	unsigned long split_nr_pages = (nr_pages + 1) / 2;
#else
	/* hugetlbfs does not support mremap: touch everything in one pass */
	unsigned long split_nr_pages = nr_pages;
#endif

	for (nr = 0; nr < split_nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			fprintf(stderr,
				"nr %lu memory corruption %Lu %Lu\n",
				nr, count,
				count_verify[nr]), exit(1);
		}
	}

#ifndef HUGETLB_TEST
	/* move the monitored area to trigger UFFD_EVENT_REMAP */
	area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
	if (area_dst == MAP_FAILED)
		perror("mremap"), exit(1);

	for (; nr < nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			fprintf(stderr,
				"nr %lu memory corruption %Lu %Lu\n",
				nr, count,
				count_verify[nr]), exit(1);
		}
	}

	/* trigger UFFD_EVENT_REMOVE; the pages must then read back zero */
	if (release_pages(area_dst))
		return 1;

	for (nr = 0; nr < nr_pages; nr++) {
		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
			fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
	}

#endif /* HUGETLB_TEST */

	return 0;
}
625
/*
 * Install the zero page at the given offset of area_dst via
 * UFFDIO_ZEROPAGE.  Returns 1 when a zeropage was installed, 0
 * otherwise.  When the memory type does not support zeropages
 * (has_zeropage == 0) the ioctl is expected to fail with -EINVAL.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	struct uffdio_zeropage uffdio_zeropage;
	int ret;
	unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);

	if (offset >= nr_pages * page_size)
		fprintf(stderr, "unexpected offset %lu\n",
			offset), exit(1);
	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
	uffdio_zeropage.range.len = page_size;
	uffdio_zeropage.mode = 0;
	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
	if (ret) {
		/* the real return value is in uffdio_zeropage.zeropage */
		if (has_zeropage) {
			/*
			 * NOTE(review): -EEXIST is treated as fatal —
			 * presumably no concurrent resolver races this
			 * path; confirm against the callers.
			 */
			if (uffdio_zeropage.zeropage == -EEXIST)
				fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"),
					exit(1);
			else
				fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
					uffdio_zeropage.zeropage), exit(1);
		} else {
			if (uffdio_zeropage.zeropage != -EINVAL)
				fprintf(stderr,
					"UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
					uffdio_zeropage.zeropage), exit(1);
		}
	} else if (has_zeropage) {
		if (uffdio_zeropage.zeropage != page_size) {
			fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
				uffdio_zeropage.zeropage), exit(1);
		} else
			return 1;
	} else {
		/* success was not supposed to be possible here */
		fprintf(stderr,
			"UFFDIO_ZEROPAGE succeeded %Ld\n",
			uffdio_zeropage.zeropage), exit(1);
	}

	return 0;
}
668
669 /* exercise UFFDIO_ZEROPAGE */
/* exercise UFFDIO_ZEROPAGE: register area_dst and zeropage its first page */
static int userfaultfd_zeropage_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;

	printf("testing UFFDIO_ZEROPAGE: ");
	fflush(stdout);

	/* start from an empty area_dst */
	if (release_pages(area_dst))
		return 1;

	/*
	 * NOTE(review): ensure userfaultfd_open() reports failure with
	 * a negative return so this check can actually fire.
	 */
	if (userfaultfd_open(0) < 0)
		return 1;
	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		fprintf(stderr, "register failure\n"), exit(1);

	expected_ioctls = EXPECTED_IOCTLS;
	if ((uffdio_register.ioctls & expected_ioctls) !=
	    expected_ioctls)
		fprintf(stderr,
			"unexpected missing ioctl for anon memory\n"),
			exit(1);

	/* if a zeropage was installed it must read back all-zero */
	if (uffdio_zeropage(uffd, 0)) {
		if (my_bcmp(area_dst, zeropage, page_size))
			fprintf(stderr, "zeropage is not zero\n"), exit(1);
	}

	close(uffd);
	printf("done.\n");
	return 0;
}
705
/*
 * Exercise the non-cooperative events: register area_dst, fork() a
 * child running faulting_process() while a single poll thread
 * resolves its faults and handles the FORK/REMAP/REMOVE events.
 * Returns non-zero unless exactly nr_pages userfaults were resolved.
 */
static int userfaultfd_events_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	unsigned long userfaults;
	pthread_t uffd_mon;
	int err, features;
	pid_t pid;
	char c;

	printf("testing events (fork, remap, remove): ");
	fflush(stdout);

	/* start from an empty area_dst */
	if (release_pages(area_dst))
		return 1;

	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		UFFD_FEATURE_EVENT_REMOVE;
	/*
	 * NOTE(review): ensure userfaultfd_open() reports failure with
	 * a negative return so this check can actually fire.
	 */
	if (userfaultfd_open(features) < 0)
		return 1;
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		fprintf(stderr, "register failure\n"), exit(1);

	expected_ioctls = EXPECTED_IOCTLS;
	if ((uffdio_register.ioctls & expected_ioctls) !=
	    expected_ioctls)
		fprintf(stderr,
			"unexpected missing ioctl for anon memory\n"),
			exit(1);

	/* NULL arg means the poll thread uses cpu 0's pipe */
	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
		perror("uffd_poll_thread create"), exit(1);

	pid = fork();
	if (pid < 0)
		perror("fork"), exit(1);

	if (!pid)
		return faulting_process();

	/*
	 * NOTE(review): err is a raw wait status; consider
	 * WIFEXITED/WEXITSTATUS for precise exit-code checking.
	 */
	waitpid(pid, &err, 0);
	if (err)
		fprintf(stderr, "faulting process failed\n"), exit(1);

	/* tell the poll thread to quit and collect its fault count */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		perror("pipe write"), exit(1);
	if (pthread_join(uffd_mon, (void **)&userfaults))
		return 1;

	close(uffd);
	printf("userfaults: %ld\n", userfaults);

	return userfaults != nr_pages;
}
765
/*
 * Top level of the stress test: allocate and initialize both areas,
 * then run "bounces" passes of stress(), swapping area_src and
 * area_dst after each and verifying the counters when BOUNCE_VERIFY
 * is set.  Finishes with the zeropage and events tests.
 */
static int userfaultfd_stress(void)
{
	void *area;
	char *tmp_area;
	unsigned long nr;
	struct uffdio_register uffdio_register;
	unsigned long cpu;
	int err;
	unsigned long userfaults[nr_cpus];

	allocate_area((void **)&area_src);
	if (!area_src)
		return 1;
	allocate_area((void **)&area_dst);
	if (!area_dst)
		return 1;

	/*
	 * NOTE(review): ensure userfaultfd_open() reports failure with
	 * a negative return so this check can actually fire.
	 */
	if (userfaultfd_open(0) < 0)
		return 1;

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify) {
		perror("count_verify");
		return 1;
	}

	/* seed every page of area_src with its mutex and counter == 1 */
	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) = (pthread_mutex_t)
			PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	/* one pipe pair per cpu, used to tell the poll threads to quit */
	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd) {
		perror("pipefd");
		return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
			perror("pipe");
			return 1;
		}
	}

	/* all-zero reference page for the BOUNCE_VERIFY comparisons */
	if (posix_memalign(&area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	zeropage = area;
	bzero(zeropage, page_size);

	pthread_mutex_lock(&uffd_read_mutex);

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 16*1024*1024);

	err = 0;
	while (bounces--) {
		unsigned long expected_ioctls;

		/* the low bits of the remaining count select the mode */
		printf("bounces: %d, mode:", bounces);
		if (bounces & BOUNCE_RANDOM)
			printf(" rnd");
		if (bounces & BOUNCE_RACINGFAULTS)
			printf(" racing");
		if (bounces & BOUNCE_VERIFY)
			printf(" ver");
		if (bounces & BOUNCE_POLL)
			printf(" poll");
		printf(", ");
		fflush(stdout);

		if (bounces & BOUNCE_POLL)
			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
		else
			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);

		/* register */
		uffdio_register.range.start = (unsigned long) area_dst;
		uffdio_register.range.len = nr_pages * page_size;
		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
			fprintf(stderr, "register failure\n");
			return 1;
		}
		expected_ioctls = EXPECTED_IOCTLS;
		if ((uffdio_register.ioctls & expected_ioctls) !=
		    expected_ioctls) {
			fprintf(stderr,
				"unexpected missing ioctl for anon memory\n");
			return 1;
		}

		/*
		 * The madvise done previously isn't enough: some
		 * uffd_thread could have read userfaults (one of
		 * those already resolved by the background thread)
		 * and it may be in the process of calling
		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
		 * area_src and it would map a zero page in it (of
		 * course such a UFFDIO_COPY is perfectly safe as it'd
		 * return -EEXIST). The problem comes at the next
		 * bounce though: that racing UFFDIO_COPY would
		 * generate zeropages in the area_src, so invalidating
		 * the previous MADV_DONTNEED. Without this additional
		 * MADV_DONTNEED those zeropages leftovers in the
		 * area_src would lead to -EEXIST failure during the
		 * next bounce, effectively leaving a zeropage in the
		 * area_dst.
		 *
		 * Try to comment this out madvise to see the memory
		 * corruption being caught pretty quick.
		 *
		 * khugepaged is also inhibited to collapse THP after
		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
		 * required to MADV_DONTNEED here.
		 */
		if (release_pages(area_dst))
			return 1;

		/* bounce pass */
		if (stress(userfaults))
			return 1;

		/* unregister */
		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
			fprintf(stderr, "register failure\n");
			return 1;
		}

		/* verification */
		if (bounces & BOUNCE_VERIFY) {
			for (nr = 0; nr < nr_pages; nr++) {
				if (*area_count(area_dst, nr) != count_verify[nr]) {
					fprintf(stderr,
						"error area_count %Lu %Lu %lu\n",
						*area_count(area_src, nr),
						count_verify[nr],
						nr);
					err = 1;
					bounces = 0;
				}
			}
		}

		/* prepare next bounce */
		tmp_area = area_src;
		area_src = area_dst;
		area_dst = tmp_area;

		printf("userfaults:");
		for (cpu = 0; cpu < nr_cpus; cpu++)
			printf(" %lu", userfaults[cpu]);
		printf("\n");
	}

	if (err)
		return err;

	close(uffd);
	return userfaultfd_zeropage_test() || userfaultfd_events_test();
}
936
937 #ifndef HUGETLB_TEST
938
/* Entry point: parse <MiB> <bounces>, size the area and run the stress. */
int main(int argc, char **argv)
{
	if (argc < 3)
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	page_size = sysconf(_SC_PAGE_SIZE);
	/* the mutex, the counter and its anti-false-positive pad must fit */
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
	    > page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(2);
	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		fprintf(stderr, "invalid MiB\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	bounces = atoi(argv[2]);
	if (bounces <= 0) {
		fprintf(stderr, "invalid bounces\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}
964
965 #else /* HUGETLB_TEST */
966
967 /*
968 * Copied from mlock2-tests.c
969 */
/*
 * Returns the system's default huge page size in bytes by parsing
 * the "Hugepagesize:" line of /proc/meminfo, or 0 on failure.
 */
unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	size_t linelen = 0;
	char *line = NULL;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;

	for (;;) {
		if (getline(&line, &linelen, f) <= 0)
			break;
		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
			hps <<= 10; /* kB -> bytes */
			break;
		}
	}

	free(line);
	fclose(f);
	return hps;
}
990
/* HugeTLB entry point: also takes the hugetlbfs file backing the areas. */
int main(int argc, char **argv)
{
	if (argc < 4)
		fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
			exit(1);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	page_size = default_huge_page_size();
	if (!page_size)
		fprintf(stderr, "Unable to determine huge page size\n"),
			exit(2);
	/* the mutex, the counter and its anti-false-positive pad must fit */
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
	    > page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(2);
	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		fprintf(stderr, "invalid MiB\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	bounces = atoi(argv[2]);
	if (bounces <= 0) {
		fprintf(stderr, "invalid bounces\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;
	huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
	if (huge_fd < 0) {
		fprintf(stderr, "Open of %s failed", argv[3]);
		perror("open");
		exit(1);
	}
	/* start from an empty hugetlbfs file */
	if (ftruncate(huge_fd, 0)) {
		fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
		perror("ftruncate");
		exit(1);
	}
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}
1031
1032 #endif
1033 #else /* __NR_userfaultfd */
1034
1035 #warning "missing __NR_userfaultfd definition"
1036
/* Fallback when the kernel headers lack __NR_userfaultfd: report a skip. */
int main(void)
{
	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
	return 0;
}
1042
1043 #endif /* __NR_userfaultfd */
1044