• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stress userfaultfd syscall.
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  *
7  * This test allocates two virtual areas and bounces the physical
8  * memory across the two virtual areas (from area_src to area_dst)
9  * using userfaultfd.
10  *
11  * There are three threads running per CPU:
12  *
13  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
14  *    page of the area_dst (while the physical page may still be in
15  *    area_src), and increments a per-page counter in the same page,
16  *    and checks its value against a verification region.
17  *
18  * 2) another per-CPU thread handles the userfaults generated by
19  *    thread 1 above. userfaultfd blocking reads or poll() modes are
20  *    exercised interleaved.
21  *
22  * 3) one last per-CPU thread transfers the memory in the background
23  *    at maximum bandwidth (if not already transferred by thread
24  * 2). Each cpu thread takes care of transferring a portion of the
25  *    area.
26  *
27  * When all threads of type 3 completed the transfer, one bounce is
28  * complete. area_src and area_dst are then swapped. All threads are
29  * respawned and so the bounce is immediately restarted in the
30  * opposite direction.
31  *
32  * per-CPU threads 1 by triggering userfaults inside
33  * pthread_mutex_lock will also verify the atomicity of the memory
34  * transfer (UFFDIO_COPY).
35  */
36 
37 #define _GNU_SOURCE
38 #include <stdio.h>
39 #include <errno.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <fcntl.h>
45 #include <time.h>
46 #include <signal.h>
47 #include <poll.h>
48 #include <string.h>
49 #include <sys/mman.h>
50 #include <sys/syscall.h>
51 #include <sys/ioctl.h>
52 #include <sys/wait.h>
53 #include <pthread.h>
54 #include <linux/userfaultfd.h>
55 #include <setjmp.h>
56 #include <stdbool.h>
57 #include <assert.h>
58 
59 #include "../kselftest.h"
60 
#ifdef __NR_userfaultfd

/* Test geometry: filled in by main() from sysconf() and the command line. */
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

/* Bounce-mode flag bits; each bounce iteration combines a subset of these. */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

/* Memory backend under test (selected by the <test type> argument). */
#define TEST_ANON	1
#define TEST_HUGETLB	2
#define TEST_SHMEM	3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = false;

static bool map_shared;		/* MAP_SHARED vs MAP_PRIVATE for hugetlb */
static int huge_fd;		/* hugetlbfs backing file descriptor */
static char *huge_fd_off0;	/* mapping of huge_fd at file offset 0 */
static unsigned long long *count_verify;	/* expected per-page counters */
static int uffd, uffd_flags, finished, *pipefd;
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;		/* all-zero reference page for comparisons */
pthread_attr_t attr;

/* Userfaultfd test statistics */
struct uffd_stats {
	int cpu;
	unsigned long missing_faults;	/* faults resolved via UFFDIO_COPY */
	unsigned long wp_faults;	/* resolved write-protect faults */
};

/* pthread_mutex_t starts at page offset 0 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  -  1)))

/* Canned usage examples printed by usage(). */
const char *examples =
    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
    "./userfaultfd anon 100 99999\n\n"
    "# Run share memory test on 1GiB region with 99 bounces:\n"
    "./userfaultfd shmem 1000 99\n\n"
    "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
    "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
    "# Run the same hugetlb test but using shmem:\n"
    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
    "# 10MiB-~6GiB 999 bounces anonymous test, "
    "continue forever unless an error triggers\n"
    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
126 
usage(void)127 static void usage(void)
128 {
129 	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
130 		"[hugetlbfs_file]\n\n");
131 	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
132 		"hugetlb_shared, shmem\n\n");
133 	fprintf(stderr, "Examples:\n\n");
134 	fprintf(stderr, "%s", examples);
135 	exit(1);
136 }
137 
uffd_stats_reset(struct uffd_stats * uffd_stats,unsigned long n_cpus)138 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
139 			     unsigned long n_cpus)
140 {
141 	int i;
142 
143 	for (i = 0; i < n_cpus; i++) {
144 		uffd_stats[i].cpu = i;
145 		uffd_stats[i].missing_faults = 0;
146 		uffd_stats[i].wp_faults = 0;
147 	}
148 }
149 
uffd_stats_report(struct uffd_stats * stats,int n_cpus)150 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
151 {
152 	int i;
153 	unsigned long long miss_total = 0, wp_total = 0;
154 
155 	for (i = 0; i < n_cpus; i++) {
156 		miss_total += stats[i].missing_faults;
157 		wp_total += stats[i].wp_faults;
158 	}
159 
160 	printf("userfaults: %llu missing (", miss_total);
161 	for (i = 0; i < n_cpus; i++)
162 		printf("%lu+", stats[i].missing_faults);
163 	printf("\b), %llu wp (", wp_total);
164 	for (i = 0; i < n_cpus; i++)
165 		printf("%lu+", stats[i].wp_faults);
166 	printf("\b)\n");
167 }
168 
anon_release_pages(char * rel_area)169 static int anon_release_pages(char *rel_area)
170 {
171 	int ret = 0;
172 
173 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
174 		perror("madvise");
175 		ret = 1;
176 	}
177 
178 	return ret;
179 }
180 
anon_allocate_area(void ** alloc_area)181 static void anon_allocate_area(void **alloc_area)
182 {
183 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
184 			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
185 	if (*alloc_area == MAP_FAILED) {
186 		fprintf(stderr, "mmap of anonymous memory failed");
187 		*alloc_area = NULL;
188 	}
189 }
190 
/*
 * No-op alias hook for anon/shmem backends: the primary mapping is the
 * only one, so there is nothing to redirect.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
194 
195 /* HugeTLB memory */
hugetlb_release_pages(char * rel_area)196 static int hugetlb_release_pages(char *rel_area)
197 {
198 	int ret = 0;
199 
200 	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
201 				rel_area == huge_fd_off0 ? 0 :
202 				nr_pages * page_size,
203 				nr_pages * page_size)) {
204 		perror("fallocate");
205 		ret = 1;
206 	}
207 
208 	return ret;
209 }
210 
/*
 * Map nr_pages of the hugetlbfs file for one of the two test areas.
 * The file offset is chosen from the value *alloc_area holds on entry
 * (assumes the caller passes &area_src or &area_dst itself, so the
 * comparison against area_src picks offset 0 vs the file's second
 * half - TODO confirm against the call sites).  For shared mappings an
 * alias mapping of the same file range is also created so the -EEXIST
 * retry paths can reach the pages through a second set of pagetables.
 * On any failure *alloc_area ends up NULL.
 */
static void hugetlb_allocate_area(void **alloc_area)
{
	void *area_alias = NULL;
	char **alloc_area_alias;

	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   MAP_HUGETLB,
			   huge_fd, *alloc_area == area_src ? 0 :
			   nr_pages * page_size);
	if (*alloc_area == MAP_FAILED) {
		perror("mmap of hugetlbfs file failed");
		goto fail;
	}

	if (map_shared) {
		/* second mapping of the same file range (the alias) */
		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
				  MAP_SHARED | MAP_HUGETLB,
				  huge_fd, *alloc_area == area_src ? 0 :
				  nr_pages * page_size);
		if (area_alias == MAP_FAILED) {
			perror("mmap of hugetlb file alias failed");
			goto fail_munmap;
		}
	}

	if (*alloc_area == area_src) {
		/* remember where file offset 0 is mapped for release_pages */
		huge_fd_off0 = *alloc_area;
		alloc_area_alias = &area_src_alias;
	} else {
		alloc_area_alias = &area_dst_alias;
	}
	if (area_alias)
		*alloc_area_alias = area_alias;

	return;

fail_munmap:
	/* undo the primary mapping before reporting failure */
	if (munmap(*alloc_area, nr_pages * page_size) < 0) {
		perror("hugetlb munmap");
		exit(1);
	}
fail:
	*alloc_area = NULL;
}
256 
/* Redirect *start into the shared alias mapping of area_dst. */
static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	if (!map_shared)
		return;
	/*
	 * We can't zap just the pagetable with hugetlbfs because
	 * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
	 * mapping where the pagetables are not established initially,
	 * this way we'll exercise the -EEXIST at the fs level.
	 */
	*start = (unsigned long) area_dst_alias + offset;
}
269 
270 /* Shared memory */
shmem_release_pages(char * rel_area)271 static int shmem_release_pages(char *rel_area)
272 {
273 	int ret = 0;
274 
275 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
276 		perror("madvise");
277 		ret = 1;
278 	}
279 
280 	return ret;
281 }
282 
shmem_allocate_area(void ** alloc_area)283 static void shmem_allocate_area(void **alloc_area)
284 {
285 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
286 			   MAP_ANONYMOUS | MAP_SHARED, -1, 0);
287 	if (*alloc_area == MAP_FAILED) {
288 		fprintf(stderr, "shared memory mmap failed\n");
289 		*alloc_area = NULL;
290 	}
291 }
292 
/* Backend-specific hooks: one instance per memory type under test. */
struct uffd_test_ops {
	unsigned long expected_ioctls;	/* ioctl bits UFFDIO_REGISTER must offer */
	void (*allocate_area)(void **alloc_area);	/* map area_src/area_dst */
	int (*release_pages)(char *rel_area);	/* free backing pages; 0 = ok */
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};

#define SHMEM_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE))

/* anon additionally supports write-protection */
#define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE) | \
					 (1 << _UFFDIO_WRITEPROTECT))

static struct uffd_test_ops anon_uffd_test_ops = {
	.expected_ioctls = ANON_EXPECTED_IOCTLS,
	.allocate_area	= anon_allocate_area,
	.release_pages	= anon_release_pages,
	.alias_mapping = noop_alias_mapping,
};

static struct uffd_test_ops shmem_uffd_test_ops = {
	.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
	.allocate_area	= shmem_allocate_area,
	.release_pages	= shmem_release_pages,
	.alias_mapping = noop_alias_mapping,
};

static struct uffd_test_ops hugetlb_uffd_test_ops = {
	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
	.allocate_area	= hugetlb_allocate_area,
	.release_pages	= hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
};

/* Points at the ops table for the backend selected on the command line. */
static struct uffd_test_ops *uffd_test_ops;
331 
/*
 * Byte-wise compare that, unlike bcmp()/memcmp(), is guaranteed to
 * inspect every byte one at a time: returns 1 at the first mismatch,
 * 0 when all n bytes are equal.  Used because the compared memory may
 * be changing concurrently (see the comment in locking_thread()).
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	size_t idx = 0;

	while (idx < n) {
		if (str1[idx] != str2[idx])
			return 1;
		idx++;
	}
	return 0;
}
340 
/*
 * Toggle write-protection on [start, start+len) via the
 * UFFDIO_WRITEPROTECT ioctl: wp=true arms protection, wp=false clears
 * it (which also wakes any blocked faulting thread).  Aborts the test
 * on ioctl failure.
 */
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms = {
		.range = {
			.start	= start,
			.len	= len,
		},
		.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
		fprintf(stderr, "clear WP failed for address 0x%Lx\n", start);
		exit(1);
	}
}
356 
/*
 * Thread type 1: repeatedly pick a page of area_dst (randomly with
 * BOUNCE_RANDOM, sequentially otherwise), take the per-page mutex
 * stored at offset 0 of that page, and bump the per-page counter while
 * checking it against count_verify[].  Faulting inside
 * pthread_mutex_lock() is what verifies UFFDIO_COPY atomicity.
 */
static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct random_data rand;
	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
	int32_t rand_nr;
	unsigned long long count;
	char randstate[64];
	unsigned int seed;
	time_t start;

	if (bounces & BOUNCE_RANDOM) {
		/* Distinct seed per bounce; add cpu when each CPU works on
		 * its own private page range (no racing faults). */
		seed = (unsigned int) time(NULL) - bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			seed += cpu;
		bzero(&rand, sizeof(rand));
		bzero(&randstate, sizeof(randstate));
		if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
			fprintf(stderr, "srandom_r error\n");
			exit(1);
		}
	} else {
		/* Sequential mode: start at a bounce-dependent offset,
		 * optionally confined to this CPU's own page range. */
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (random_r(&rand, &rand_nr)) {
				fprintf(stderr, "random_r 1 error\n");
				exit(1);
			}
			page_nr = rand_nr;
			/* random_r() only fills an int32_t; draw a second
			 * value to populate the high bits of a 64-bit
			 * page_nr before the modulo below. */
			if (sizeof(page_nr) > sizeof(rand_nr)) {
				if (random_r(&rand, &rand_nr)) {
					fprintf(stderr, "random_r 2 error\n");
					exit(1);
				}
				page_nr |= (((unsigned long) rand_nr) << 16) <<
					   16;
			}
		} else
			page_nr += 1;
		page_nr %= nr_pages;

		start = time(NULL);
		if (bounces & BOUNCE_VERIFY) {
			count = *area_count(area_dst, page_nr);
			if (!count) {
				/* counter must never read as zero once the
				 * page exists */
				fprintf(stderr,
					"page_nr %lu wrong count %Lu %Lu\n",
					page_nr, count,
					count_verify[page_nr]);
				exit(1);
			}


			/*
			 * We can't use bcmp (or memcmp) because that
			 * returns 0 erroneously if the memory is
			 * changing under it (even if the end of the
			 * page is never changing and always
			 * different).
			 */
#if 1
			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
				     page_size)) {
				fprintf(stderr,
					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
					page_nr, count, count_verify[page_nr]);
				exit(1);
			}
#else
			/* Alternative (disabled) check using bcmp in a
			 * retry loop, kept for debugging. */
			unsigned long loops;

			loops = 0;
			/* uncomment the below line to test with mutex */
			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
			while (!bcmp(area_dst + page_nr * page_size, zeropage,
				     page_size)) {
				loops += 1;
				if (loops > 10)
					break;
			}
			/* uncomment below line to test with mutex */
			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
			if (loops) {
				fprintf(stderr,
					"page_nr %lu all zero thread %lu %p %lu\n",
					page_nr, cpu, area_dst + page_nr * page_size,
					loops);
				if (loops > 10)
					exit(1);
			}
#endif
		}

		/* Critical section: locking the page's mutex may fault the
		 * page in; then increment both the in-page counter and its
		 * verification copy atomically w.r.t. other lockers. */
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr]) {
			fprintf(stderr,
				"page_nr %lu memory corruption %Lu %Lu\n",
				page_nr, count,
				count_verify[page_nr]); exit(1);
		}
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));

		/* A userfault taking more than a second is suspicious but
		 * may be a false positive under memory overcommit. */
		if (time(NULL) - start > 1)
			fprintf(stderr,
				"userfault too slow %ld "
				"possible false positive with overcommit\n",
				time(NULL) - start);
	}

	return NULL;
}
476 
/*
 * Second UFFDIO_COPY attempt on an already-resolved page: after
 * redirecting the destination through the backend's alias mapping it
 * must fail with -EEXIST; any other outcome is a test failure.
 */
static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
				     uffdio_copy->len,
				     offset);
	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy->copy != -EEXIST) {
			fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
				uffdio_copy->copy);
			exit(1);
		}
	} else {
		/* the copy must NOT succeed a second time */
		fprintf(stderr,	"UFFDIO_COPY retry unexpected %Ld\n",
			uffdio_copy->copy); exit(1);
	}
}
495 
/*
 * Resolve one missing page with UFFDIO_COPY, copying the page at
 * `offset` from area_src into area_dst (write-protected when uffd-wp
 * is under test).  Returns 1 if this call performed the copy, 0 if the
 * page was already in place (-EEXIST race with another thread).  With
 * `retry` set, occasionally re-attempts the copy once to exercise the
 * -EEXIST path (test_uffdio_copy_eexist is presumably re-armed by an
 * alarm handler elsewhere in the file - see ALARM_INTERVAL_SECS).
 */
static int __copy_page(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size) {
		fprintf(stderr, "unexpected offset %lu\n", offset);
		exit(1);
	}
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	if (test_uffdio_wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST) {
			fprintf(stderr, "UFFDIO_COPY error %Ld\n",
				uffdio_copy.copy);
			exit(1);
		}
	} else if (uffdio_copy.copy != page_size) {
		/* partial copies are never expected for a single page */
		fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
			uffdio_copy.copy); exit(1);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}
531 
/* Copy with the occasional -EEXIST retry enabled (background threads). */
static int copy_page_retry(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, true);
}

/* Plain copy, never retried (fault-handling threads). */
static int copy_page(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, false);
}
541 
uffd_read_msg(int ufd,struct uffd_msg * msg)542 static int uffd_read_msg(int ufd, struct uffd_msg *msg)
543 {
544 	int ret = read(uffd, msg, sizeof(*msg));
545 
546 	if (ret != sizeof(*msg)) {
547 		if (ret < 0) {
548 			if (errno == EAGAIN)
549 				return 1;
550 			perror("blocking read error");
551 		} else {
552 			fprintf(stderr, "short read\n");
553 		}
554 		exit(1);
555 	}
556 
557 	return 0;
558 }
559 
/*
 * Dispatch one pagefault message: write-protect faults are resolved by
 * dropping the protection via wp_range(), missing faults by copying
 * the page in from area_src.  Per-CPU stats are updated accordingly.
 */
static void uffd_handle_page_fault(struct uffd_msg *msg,
				   struct uffd_stats *stats)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT) {
		fprintf(stderr, "unexpected msg event %u\n", msg->event);
		exit(1);
	}

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* resolve the wp fault: clear protection (wakes the waiter) */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		stats->wp_faults++;
	} else {
		/* Missing page faults */
		if (bounces & BOUNCE_VERIFY &&
		    msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) {
			fprintf(stderr, "unexpected write fault\n");
			exit(1);
		}

		/* page-align the faulting address relative to area_dst */
		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		/* copy_page() returns 1 only when this thread actually did
		 * the copy, so missing_faults counts real resolutions. */
		if (copy_page(uffd, offset))
			stats->missing_faults++;
	}
}
588 
/*
 * Thread type 2 (poll mode): poll the userfaultfd together with a
 * per-CPU shutdown pipe, handling pagefault, fork, remove and remap
 * events until a byte arrives on the pipe (written by stress()).
 */
static void *uffd_poll_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	unsigned long cpu = stats->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];	/* read end of this CPU's pipe */
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (!ret) {
			/* infinite timeout: a zero return is impossible
			 * unless something is broken */
			fprintf(stderr, "poll error %d\n", ret);
			exit(1);
		}
		if (ret < 0) {
			perror("poll");
			exit(1);
		}
		if (pollfd[1].revents & POLLIN) {
			/* shutdown byte from stress(): drain it and leave */
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
				fprintf(stderr, "read pipefd error\n");
				exit(1);
			}
			break;
		}
		if (!(pollfd[0].revents & POLLIN)) {
			fprintf(stderr, "pollfd[0].revents %d\n",
				pollfd[0].revents);
			exit(1);
		}
		if (uffd_read_msg(uffd, &msg))
			continue;	/* EAGAIN: nothing was pending */
		switch (msg.event) {
		default:
			fprintf(stderr, "unexpected msg event %u\n",
				msg.event); exit(1);
			break;
		case UFFD_EVENT_PAGEFAULT:
			uffd_handle_page_fault(&msg, stats);
			break;
		case UFFD_EVENT_FORK:
			/* switch to the forked child's uffd from now on */
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			/* stop monitoring the removed range */
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
				fprintf(stderr, "remove failure\n");
				exit(1);
			}
			break;
		case UFFD_EVENT_REMAP:
			/* follow area_dst to its post-mremap() address */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
658 
/* Handshakes uffd_read_thread startup with its creator (see stress()). */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Thread type 2 (blocking-read mode): loop forever reading fault
 * messages and resolving them.  Unlocking uffd_read_mutex signals the
 * creator that the thread is up; it never returns normally and is
 * terminated via pthread_cancel() from stress().
 */
static void *uffd_read_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	struct uffd_msg msg;

	pthread_mutex_unlock(&uffd_read_mutex);
	/* from here cancellation is ok */

	for (;;) {
		if (uffd_read_msg(uffd, &msg))
			continue;	/* EAGAIN: retry */
		uffd_handle_page_fault(&msg, stats);
	}

	return NULL;
}
677 
background_thread(void * arg)678 static void *background_thread(void *arg)
679 {
680 	unsigned long cpu = (unsigned long) arg;
681 	unsigned long page_nr, start_nr, mid_nr, end_nr;
682 
683 	start_nr = cpu * nr_pages_per_cpu;
684 	end_nr = (cpu+1) * nr_pages_per_cpu;
685 	mid_nr = (start_nr + end_nr) / 2;
686 
687 	/* Copy the first half of the pages */
688 	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
689 		copy_page_retry(uffd, page_nr * page_size);
690 
691 	/*
692 	 * If we need to test uffd-wp, set it up now.  Then we'll have
693 	 * at least the first half of the pages mapped already which
694 	 * can be write-protected for testing
695 	 */
696 	if (test_uffdio_wp)
697 		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
698 			nr_pages_per_cpu * page_size, true);
699 
700 	/*
701 	 * Continue the 2nd half of the page copying, handling write
702 	 * protection faults if any
703 	 */
704 	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
705 		copy_page_retry(uffd, page_nr * page_size);
706 
707 	return NULL;
708 }
709 
/*
 * Run one bounce: spawn the three per-CPU thread types (locking,
 * fault-handling, background copy), wait for the background copy to
 * finish, zap area_src, then wind everything down.  Returns 0 on
 * success, 1 on any failure.
 */
static int stress(struct uffd_stats *uffd_stats)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
			/* blocks until the reader unlocked the mutex,
			 * i.e. it is running and cancellable */
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src, the whole area has
	 * been transferred already by the background threads. The
	 * area_src could then be faulted in in a racy way by still
	 * running uffdio_threads reading zeropages after we zapped
	 * area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
	if (uffd_test_ops->release_pages(area_src))
		return 1;


	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	/* Shut down the fault handlers: poll mode via the per-CPU pipe,
	 * read mode via cancellation (readers block inside read()). */
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
				fprintf(stderr, "pipefd write error\n");
				return 1;
			}
			if (pthread_join(uffd_threads[cpu],
					 (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	return 0;
}
780 
userfaultfd_open(int features)781 static int userfaultfd_open(int features)
782 {
783 	struct uffdio_api uffdio_api;
784 
785 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
786 	if (uffd < 0) {
787 		fprintf(stderr,
788 			"userfaultfd syscall not available in this kernel\n");
789 		return 1;
790 	}
791 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
792 
793 	uffdio_api.api = UFFD_API;
794 	uffdio_api.features = features;
795 	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
796 		fprintf(stderr, "UFFDIO_API\n");
797 		return 1;
798 	}
799 	if (uffdio_api.api != UFFD_API) {
800 		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
801 		return 1;
802 	}
803 
804 	return 0;
805 }
806 
/* siglongjmp target armed by faulting_process() during the signal test. */
sigjmp_buf jbuf, *sigbuf;

/*
 * SIGBUS handler for the UFFD_FEATURE_SIGBUS test: jump back to the
 * armed sigsetjmp() point; an unarmed SIGBUS is fatal.
 */
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
{
	if (sig == SIGBUS) {
		if (sigbuf)
			siglongjmp(*sigbuf, 1);
		abort();
	}
}
817 
/*
 * For non-cooperative userfaultfd test we fork() a process that will
 * generate pagefaults, will mremap the area monitored by the
 * userfaultfd and at last this process will release the monitored
 * area.
 * For the anonymous and shared memory the area is divided into two
 * parts, the first part is accessed before mremap, and the second
 * part is accessed after mremap. Since hugetlbfs does not support
 * mremap, the entire monitored area is accessed in a single pass for
 * HUGETLB_TEST.
 * The release of the pages currently generates event for shmem and
 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
 * for hugetlb.
 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
 * monitored area, generate pagefaults and test that signal is delivered.
 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
 * test robustness use case - we release monitored area, fork a process
 * that will generate pagefaults and verify signal is generated.
 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
 * feature. Using monitor thread, verify no userfault events are generated.
 */
static int faulting_process(int signal_test)
{
	unsigned long nr;
	unsigned long long count;
	unsigned long split_nr_pages;
	unsigned long lastnr;
	struct sigaction act;
	unsigned long signalled = 0;

	/* hugetlb can't mremap: the whole area is done in a single pass */
	if (test_type != TEST_HUGETLB)
		split_nr_pages = (nr_pages + 1) / 2;
	else
		split_nr_pages = nr_pages;

	if (signal_test) {
		/* arm the SIGBUS -> siglongjmp trampoline (see sighndl) */
		sigbuf = &jbuf;
		memset(&act, 0, sizeof(act));
		act.sa_sigaction = sighndl;
		act.sa_flags = SA_SIGINFO;
		if (sigaction(SIGBUS, &act, 0)) {
			perror("sigaction");
			return 1;
		}
		lastnr = (unsigned long)-1;
	}

	for (nr = 0; nr < split_nr_pages; nr++) {
		int steps = 1;
		unsigned long offset = nr * page_size;

		if (signal_test) {
			if (sigsetjmp(*sigbuf, 1) != 0) {
				/* Re-entered via SIGBUS: the same page must
				 * not signal twice at the same step. */
				if (steps == 1 && nr == lastnr) {
					fprintf(stderr, "Signal repeated\n");
					return 1;
				}

				lastnr = nr;
				if (signal_test == 1) {
					if (steps == 1) {
						/* This is a MISSING request */
						steps++;
						if (copy_page(uffd, offset))
							signalled++;
					} else {
						/* This is a WP request */
						assert(steps == 2);
						wp_range(uffd,
							 (__u64)area_dst +
							 offset,
							 page_size, false);
					}
				} else {
					/* signal_test == 2: only count it */
					signalled++;
					continue;
				}
			}
		}

		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			/* NOTE(review): unlike the post-mremap loop below,
			 * this path only reports the corruption and does
			 * not exit(1) - confirm whether intentional. */
			fprintf(stderr,
				"nr %lu memory corruption %Lu %Lu\n",
				nr, count,
				count_verify[nr]);
		}
		/*
		 * Trigger write protection if there is by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	if (signal_test)
		return signalled != split_nr_pages;

	if (test_type == TEST_HUGETLB)
		return 0;

	/* Move area_dst over area_src: the monitor sees UFFD_EVENT_REMAP. */
	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
	if (area_dst == MAP_FAILED) {
		perror("mremap");
		exit(1);
	}

	/* Touch the second half at its new location. */
	for (; nr < nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			fprintf(stderr,
				"nr %lu memory corruption %Lu %Lu\n",
				nr, count,
				count_verify[nr]); exit(1);
		}
		/*
		 * Trigger write protection if there is by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	/* Release everything: generates UFFD_EVENT_REMOVE for anon/shmem. */
	if (uffd_test_ops->release_pages(area_dst))
		return 1;

	/* After release every page must read back as zeroes. */
	for (nr = 0; nr < nr_pages; nr++) {
		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
			fprintf(stderr, "nr %lu is not zero\n", nr);
			exit(1);
		}
	}

	return 0;
}
952 
/*
 * Second UFFDIO_ZEROPAGE attempt on an already-populated page, issued
 * through the backend's alias mapping: it must fail with -EEXIST;
 * any other outcome is a test failure.
 */
static void retry_uffdio_zeropage(int ufd,
				  struct uffdio_zeropage *uffdio_zeropage,
				  unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
				     uffdio_zeropage->range.len,
				     offset);
	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
		/* real retval in uffdio_zeropage->zeropage */
		if (uffdio_zeropage->zeropage != -EEXIST) {
			fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
				uffdio_zeropage->zeropage);
			exit(1);
		}
	} else {
		/* installing the zeropage twice must never succeed */
		fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
			uffdio_zeropage->zeropage); exit(1);
	}
}
971 
/*
 * Install a zeropage at `offset` in area_dst and validate the outcome
 * against what the backend advertises: backends without
 * _UFFDIO_ZEROPAGE in expected_ioctls (hugetlb) must fail with
 * -EINVAL; the others must fill exactly one page.  Returns 1 when the
 * zeropage was actually installed, 0 otherwise.  With `retry`, an
 * extra attempt exercising the -EEXIST path is made once after a
 * successful install.
 */
static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_zeropage uffdio_zeropage;
	int ret;
	unsigned long has_zeropage;

	/* does this backend support UFFDIO_ZEROPAGE at all? */
	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);

	if (offset >= nr_pages * page_size) {
		fprintf(stderr, "unexpected offset %lu\n", offset);
		exit(1);
	}
	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
	uffdio_zeropage.range.len = page_size;
	uffdio_zeropage.mode = 0;
	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
	if (ret) {
		/* real retval in ufdio_zeropage.zeropage */
		if (has_zeropage) {
			/* the caller zapped the area, so even -EEXIST is
			 * unexpected here */
			if (uffdio_zeropage.zeropage == -EEXIST) {
				fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n");
				exit(1);
			} else {
				fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
					uffdio_zeropage.zeropage);
				exit(1);
			}
		} else {
			/* unsupported backends must fail with -EINVAL */
			if (uffdio_zeropage.zeropage != -EINVAL) {
				fprintf(stderr,
					"UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
					uffdio_zeropage.zeropage);
				exit(1);
			}
		}
	} else if (has_zeropage) {
		if (uffdio_zeropage.zeropage != page_size) {
			/* partial installs are never expected */
			fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
				uffdio_zeropage.zeropage); exit(1);
		} else {
			if (test_uffdio_zeropage_eexist && retry) {
				test_uffdio_zeropage_eexist = false;
				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
						      offset);
			}
			return 1;
		}
	} else {
		/* success on a backend that shouldn't support it */
		fprintf(stderr,
			"UFFDIO_ZEROPAGE succeeded %Ld\n",
			uffdio_zeropage.zeropage); exit(1);
	}

	return 0;
}
1027 
/* UFFDIO_ZEROPAGE without the -EEXIST retry path. */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	return __uffdio_zeropage(ufd, offset, false);
}
1032 
1033 /* exercise UFFDIO_ZEROPAGE */
userfaultfd_zeropage_test(void)1034 static int userfaultfd_zeropage_test(void)
1035 {
1036 	struct uffdio_register uffdio_register;
1037 	unsigned long expected_ioctls;
1038 
1039 	printf("testing UFFDIO_ZEROPAGE: ");
1040 	fflush(stdout);
1041 
1042 	if (uffd_test_ops->release_pages(area_dst))
1043 		return 1;
1044 
1045 	if (userfaultfd_open(0) < 0)
1046 		return 1;
1047 	uffdio_register.range.start = (unsigned long) area_dst;
1048 	uffdio_register.range.len = nr_pages * page_size;
1049 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1050 	if (test_uffdio_wp)
1051 		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1052 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1053 		fprintf(stderr, "register failure\n");
1054 		exit(1);
1055 	}
1056 
1057 	expected_ioctls = uffd_test_ops->expected_ioctls;
1058 	if ((uffdio_register.ioctls & expected_ioctls) !=
1059 	    expected_ioctls) {
1060 		fprintf(stderr,
1061 			"unexpected missing ioctl for anon memory\n");
1062 		exit(1);
1063 	}
1064 
1065 	if (uffdio_zeropage(uffd, 0)) {
1066 		if (my_bcmp(area_dst, zeropage, page_size)) {
1067 			fprintf(stderr, "zeropage is not zero\n");
1068 			exit(1);
1069 		}
1070 	}
1071 
1072 	close(uffd);
1073 	printf("done.\n");
1074 	return 0;
1075 }
1076 
/*
 * Exercise uffd event delivery (UFFD_FEATURE_EVENT_FORK, _REMAP,
 * _REMOVE): a forked child faults across area_dst while a monitor
 * thread resolves the missing faults and consumes the events.
 *
 * Returns 0 when the monitor observed exactly one missing fault per
 * page.
 */
static int userfaultfd_events_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	pthread_t uffd_mon;
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing events (fork, remap, remove): ");
	fflush(stdout);

	/* zap the destination so every touch generates a fault */
	if (uffd_test_ops->release_pages(area_dst))
		return 1;

	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		UFFD_FEATURE_EVENT_REMOVE;
	if (userfaultfd_open(features) < 0)
		return 1;
	/* the monitor thread uses poll(), so reads must not block */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
		fprintf(stderr, "register failure\n");
		exit(1);
	}

	expected_ioctls = uffd_test_ops->expected_ioctls;
	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
		fprintf(stderr, "unexpected missing ioctl for anon memory\n");
		exit(1);
	}

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
		perror("uffd_poll_thread create");
		exit(1);
	}

	pid = fork();
	if (pid < 0) {
		perror("fork");
		exit(1);
	}

	/* child: walk the area faulting; parent's monitor resolves */
	if (!pid)
		return faulting_process(0);

	waitpid(pid, &err, 0);
	if (err) {
		fprintf(stderr, "faulting process failed\n");
		exit(1);
	}

	/* kick the monitor out of poll() so it can terminate */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
		perror("pipe write");
		exit(1);
	}
	if (pthread_join(uffd_mon, NULL))
		return 1;

	close(uffd);

	uffd_stats_report(&stats, 1);

	return stats.missing_faults != nr_pages;
}
1148 
/*
 * Exercise UFFD_FEATURE_SIGBUS: with SIGBUS mode enabled, touching a
 * registered-but-missing page must deliver SIGBUS to the faulting
 * process instead of queueing a userfault message.  Any userfault the
 * monitor thread counts is therefore a failure.
 */
static int userfaultfd_sig_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	unsigned long userfaults;
	pthread_t uffd_mon;
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing signal delivery: ");
	fflush(stdout);

	/* zap the destination so the first pass actually faults */
	if (uffd_test_ops->release_pages(area_dst))
		return 1;

	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
	if (userfaultfd_open(features) < 0)
		return 1;
	/* monitor thread poll()s; keep the fd non-blocking */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
		fprintf(stderr, "register failure\n");
		exit(1);
	}

	expected_ioctls = uffd_test_ops->expected_ioctls;
	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
		fprintf(stderr, "unexpected missing ioctl for anon memory\n");
		exit(1);
	}

	/* first pass: populate the area in SIGBUS mode (mode 1) */
	if (faulting_process(1)) {
		fprintf(stderr, "faulting process failed\n");
		exit(1);
	}

	/* zap again so the forked child faults on every page */
	if (uffd_test_ops->release_pages(area_dst))
		return 1;

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
		perror("uffd_poll_thread create");
		exit(1);
	}

	pid = fork();
	if (pid < 0) {
		perror("fork");
		exit(1);
	}

	/* child: fault in mode 2; must see SIGBUS, not userfaults */
	if (!pid)
		exit(faulting_process(2));

	waitpid(pid, &err, 0);
	if (err) {
		fprintf(stderr, "faulting process failed\n");
		exit(1);
	}

	/* wake the monitor out of poll() so it can exit */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
		perror("pipe write");
		exit(1);
	}
	/* the monitor returns its userfault count as the thread result */
	if (pthread_join(uffd_mon, (void **)&userfaults))
		return 1;

	printf("done.\n");
	if (userfaults)
		fprintf(stderr, "Signal test failed, userfaults: %ld\n",
			userfaults);
	close(uffd);
	/* SIGBUS mode must have produced zero userfault messages */
	return userfaults != 0;
}
1229 
/*
 * Main stress driver: set up the two areas, the verification table and
 * the per-CPU pipes, then bounce the memory between area_src and
 * area_dst "bounces" times.  Each bounce registers area_dst, runs the
 * per-CPU locking/fault-handling/copying threads via stress(),
 * optionally verifies the per-page counters, and swaps src/dst for the
 * next round.  Finishes by running the zeropage, signal and events
 * sub-tests.  Returns non-zero on any failure.
 */
static int userfaultfd_stress(void)
{
	void *area;
	char *tmp_area;
	unsigned long nr;
	struct uffdio_register uffdio_register;
	unsigned long cpu;
	int err;
	struct uffd_stats uffd_stats[nr_cpus];

	uffd_test_ops->allocate_area((void **)&area_src);
	if (!area_src)
		return 1;
	uffd_test_ops->allocate_area((void **)&area_dst);
	if (!area_dst)
		return 1;

	if (userfaultfd_open(0) < 0)
		return 1;

	/* expected per-page counter values checked by the lock threads */
	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify) {
		perror("count_verify");
		return 1;
	}

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) = (pthread_mutex_t)
			PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	/* one pipe pair per CPU, used to tell poll()ing readers to quit */
	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd) {
		perror("pipefd");
		return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
			perror("pipe");
			return 1;
		}
	}

	/* all-zero reference page for the UFFDIO_ZEROPAGE sub-test */
	if (posix_memalign(&area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	zeropage = area;
	bzero(zeropage, page_size);

	/* held locked so blocking readers wait until copying starts */
	pthread_mutex_lock(&uffd_read_mutex);

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 16*1024*1024);

	err = 0;
	while (bounces--) {
		unsigned long expected_ioctls;

		printf("bounces: %d, mode:", bounces);
		if (bounces & BOUNCE_RANDOM)
			printf(" rnd");
		if (bounces & BOUNCE_RACINGFAULTS)
			printf(" racing");
		if (bounces & BOUNCE_VERIFY)
			printf(" ver");
		if (bounces & BOUNCE_POLL)
			printf(" poll");
		printf(", ");
		fflush(stdout);

		/* alternate poll() and blocking-read fault handling */
		if (bounces & BOUNCE_POLL)
			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
		else
			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);

		/* register */
		uffdio_register.range.start = (unsigned long) area_dst;
		uffdio_register.range.len = nr_pages * page_size;
		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
		if (test_uffdio_wp)
			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
			fprintf(stderr, "register failure\n");
			return 1;
		}
		expected_ioctls = uffd_test_ops->expected_ioctls;
		if ((uffdio_register.ioctls & expected_ioctls) !=
		    expected_ioctls) {
			fprintf(stderr,
				"unexpected missing ioctl for anon memory\n");
			return 1;
		}

		if (area_dst_alias) {
			uffdio_register.range.start = (unsigned long)
				area_dst_alias;
			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
				fprintf(stderr, "register failure alias\n");
				return 1;
			}
		}

		/*
		 * The madvise done previously isn't enough: some
		 * uffd_thread could have read userfaults (one of
		 * those already resolved by the background thread)
		 * and it may be in the process of calling
		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
		 * area_src and it would map a zero page in it (of
		 * course such a UFFDIO_COPY is perfectly safe as it'd
		 * return -EEXIST). The problem comes at the next
		 * bounce though: that racing UFFDIO_COPY would
		 * generate zeropages in the area_src, so invalidating
		 * the previous MADV_DONTNEED. Without this additional
		 * MADV_DONTNEED those zeropages leftovers in the
		 * area_src would lead to -EEXIST failure during the
		 * next bounce, effectively leaving a zeropage in the
		 * area_dst.
		 *
		 * Try to comment this out madvise to see the memory
		 * corruption being caught pretty quick.
		 *
		 * khugepaged is also inhibited to collapse THP after
		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
		 * required to MADV_DONTNEED here.
		 */
		if (uffd_test_ops->release_pages(area_dst))
			return 1;

		uffd_stats_reset(uffd_stats, nr_cpus);

		/* bounce pass */
		if (stress(uffd_stats))
			return 1;

		/* Clear all the write protections if there is any */
		if (test_uffdio_wp)
			wp_range(uffd, (unsigned long)area_dst,
				 nr_pages * page_size, false);

		/* unregister */
		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
			fprintf(stderr, "unregister failure\n");
			return 1;
		}
		if (area_dst_alias) {
			uffdio_register.range.start = (unsigned long) area_dst;
			if (ioctl(uffd, UFFDIO_UNREGISTER,
				  &uffdio_register.range)) {
				fprintf(stderr, "unregister failure alias\n");
				return 1;
			}
		}

		/* verification */
		if (bounces & BOUNCE_VERIFY) {
			for (nr = 0; nr < nr_pages; nr++) {
				/*
				 * NOTE(review): the check reads area_dst but
				 * the message prints the area_src count —
				 * confirm whether printing the src side on a
				 * dst-side mismatch is intended.
				 */
				if (*area_count(area_dst, nr) != count_verify[nr]) {
					fprintf(stderr,
						"error area_count %Lu %Lu %lu\n",
						*area_count(area_src, nr),
						count_verify[nr],
						nr);
					err = 1;
					bounces = 0;
				}
			}
		}

		/* prepare next bounce */
		tmp_area = area_src;
		area_src = area_dst;
		area_dst = tmp_area;

		tmp_area = area_src_alias;
		area_src_alias = area_dst_alias;
		area_dst_alias = tmp_area;

		uffd_stats_report(uffd_stats, nr_cpus);
	}

	if (err)
		return err;

	close(uffd);
	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
		|| userfaultfd_events_test();
}
1428 
/*
 * Copied from mlock2-tests.c
 *
 * Parse /proc/meminfo for the default huge page size and return it in
 * bytes; returns 0 when the file can't be opened or the field is
 * absent.
 */
unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	size_t cap = 0;
	char *buf = NULL;
	FILE *meminfo = fopen("/proc/meminfo", "r");

	if (!meminfo)
		return 0;

	/* find the "Hugepagesize:" line and convert its kB value to bytes */
	while (getline(&buf, &cap, meminfo) > 0) {
		if (sscanf(buf, "Hugepagesize:       %lu kB", &hps) == 1) {
			hps <<= 10;
			break;
		}
	}

	free(buf);
	fclose(meminfo);
	return hps;
}
1452 
set_test_type(const char * type)1453 static void set_test_type(const char *type)
1454 {
1455 	if (!strcmp(type, "anon")) {
1456 		test_type = TEST_ANON;
1457 		uffd_test_ops = &anon_uffd_test_ops;
1458 		/* Only enable write-protect test for anonymous test */
1459 		test_uffdio_wp = true;
1460 	} else if (!strcmp(type, "hugetlb")) {
1461 		test_type = TEST_HUGETLB;
1462 		uffd_test_ops = &hugetlb_uffd_test_ops;
1463 	} else if (!strcmp(type, "hugetlb_shared")) {
1464 		map_shared = true;
1465 		test_type = TEST_HUGETLB;
1466 		uffd_test_ops = &hugetlb_uffd_test_ops;
1467 	} else if (!strcmp(type, "shmem")) {
1468 		map_shared = true;
1469 		test_type = TEST_SHMEM;
1470 		uffd_test_ops = &shmem_uffd_test_ops;
1471 	} else {
1472 		fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
1473 	}
1474 
1475 	if (test_type == TEST_HUGETLB)
1476 		page_size = default_huge_page_size();
1477 	else
1478 		page_size = sysconf(_SC_PAGE_SIZE);
1479 
1480 	if (!page_size) {
1481 		fprintf(stderr, "Unable to determine page size\n");
1482 		exit(2);
1483 	}
1484 	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1485 	    > page_size) {
1486 		fprintf(stderr, "Impossible to run this test\n");
1487 		exit(2);
1488 	}
1489 }
1490 
sigalrm(int sig)1491 static void sigalrm(int sig)
1492 {
1493 	if (sig != SIGALRM)
1494 		abort();
1495 	test_uffdio_copy_eexist = true;
1496 	test_uffdio_zeropage_eexist = true;
1497 	alarm(ALARM_INTERVAL_SECS);
1498 }
1499 
main(int argc,char ** argv)1500 int main(int argc, char **argv)
1501 {
1502 	if (argc < 4)
1503 		usage();
1504 
1505 	if (signal(SIGALRM, sigalrm) == SIG_ERR) {
1506 		fprintf(stderr, "failed to arm SIGALRM");
1507 		exit(1);
1508 	}
1509 	alarm(ALARM_INTERVAL_SECS);
1510 
1511 	set_test_type(argv[1]);
1512 
1513 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1514 	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1515 		nr_cpus;
1516 	if (!nr_pages_per_cpu) {
1517 		fprintf(stderr, "invalid MiB\n");
1518 		usage();
1519 	}
1520 
1521 	bounces = atoi(argv[3]);
1522 	if (bounces <= 0) {
1523 		fprintf(stderr, "invalid bounces\n");
1524 		usage();
1525 	}
1526 	nr_pages = nr_pages_per_cpu * nr_cpus;
1527 
1528 	if (test_type == TEST_HUGETLB) {
1529 		if (argc < 5)
1530 			usage();
1531 		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1532 		if (huge_fd < 0) {
1533 			fprintf(stderr, "Open of %s failed", argv[3]);
1534 			perror("open");
1535 			exit(1);
1536 		}
1537 		if (ftruncate(huge_fd, 0)) {
1538 			fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1539 			perror("ftruncate");
1540 			exit(1);
1541 		}
1542 	}
1543 	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1544 	       nr_pages, nr_pages_per_cpu);
1545 	return userfaultfd_stress();
1546 }
1547 
1548 #else /* __NR_userfaultfd */
1549 
1550 #warning "missing __NR_userfaultfd definition"
1551 
main(void)1552 int main(void)
1553 {
1554 	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1555 	return KSFT_SKIP;
1556 }
1557 
1558 #endif /* __NR_userfaultfd */
1559