• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Stress userfaultfd syscall.
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 *
 * This test allocates two virtual areas and bounces the physical
 * memory across the two virtual areas (from area_src to area_dst)
 * using userfaultfd.
 *
 * There are three threads running per CPU:
 *
 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
 *    page of the area_dst (while the physical page may still be in
 *    area_src), and increments a per-page counter in the same page,
 *    and checks its value against a verification region.
 *
 * 2) another per-CPU thread handles the userfaults generated by
 *    thread 1 above. userfaultfd blocking reads or poll() modes are
 *    exercised interleaved.
 *
 * 3) one last per-CPU thread transfers the memory in the background
 *    at maximum bandwidth (if not already transferred by thread
 *    2). Each cpu thread takes care of transferring a portion of the
 *    area.
 *
 * When all threads of type 3 have completed the transfer, one bounce
 * is complete. area_src and area_dst are then swapped. All threads are
 * respawned and so the bounce is immediately restarted in the
 * opposite direction.
 *
 * per-CPU threads 1 by triggering userfaults inside
 * pthread_mutex_lock will also verify the atomicity of the memory
 * transfer (UFFDIO_COPY).
 *
 * The program takes two parameters: the amounts of physical memory in
 * megabytes (MiB) of the area and the number of bounces to execute.
 *
 * # 100MiB 99999 bounces
 * ./userfaultfd 100 99999
 *
 * # 1GiB 99 bounces
 * ./userfaultfd 1000 99
 *
 * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
 * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
 */
50 
51 #define _GNU_SOURCE
52 #include <stdio.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <stdlib.h>
56 #include <sys/types.h>
57 #include <sys/stat.h>
58 #include <fcntl.h>
59 #include <time.h>
60 #include <signal.h>
61 #include <poll.h>
62 #include <string.h>
63 #include <sys/mman.h>
64 #include <sys/syscall.h>
65 #include <sys/ioctl.h>
66 #include <sys/wait.h>
67 #include <pthread.h>
68 #include <linux/userfaultfd.h>
69 
70 #ifdef __NR_userfaultfd
71 
/* Test geometry, filled in by main() before the stress run starts. */
static unsigned long nr_cpus;
static unsigned long nr_pages;
static unsigned long nr_pages_per_cpu;
static unsigned long page_size;

/*
 * The low bits of the remaining-bounces counter select the mode of
 * each bounce pass.
 */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

#ifdef HUGETLB_TEST
static int huge_fd;
static char *huge_fd_off0;
#endif
/* Expected value of every per-page counter, indexed by page number. */
static unsigned long long *count_verify;
static int uffd;
static int uffd_flags;
static int finished;
static int *pipefd;
static char *area_src;
static char *area_dst;
static char *zeropage;
pthread_attr_t attr;

/* The per-page pthread_mutex_t lives at offset 0 of page ___nr. */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * The per-page counter follows the pthread_mutex_t in the same page,
 * rounded up to its natural alignment to avoid non alignment faults
 * on non-x86 archs.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *)				\
	 (((unsigned long)(___area) + (___nr) * page_size +		\
	   sizeof(pthread_mutex_t) + sizeof(unsigned long long) - 1) &	\
	  ~(unsigned long)(sizeof(unsigned long long) - 1)))
104 
105 #if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)
106 
107 /* Anonymous memory */
108 #define EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
109 				 (1 << _UFFDIO_COPY) | \
110 				 (1 << _UFFDIO_ZEROPAGE))
111 
release_pages(char * rel_area)112 static int release_pages(char *rel_area)
113 {
114 	int ret = 0;
115 
116 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
117 		perror("madvise");
118 		ret = 1;
119 	}
120 
121 	return ret;
122 }
123 
allocate_area(void ** alloc_area)124 static void allocate_area(void **alloc_area)
125 {
126 	if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
127 		fprintf(stderr, "out of memory\n");
128 		*alloc_area = NULL;
129 	}
130 }
131 
132 #else /* HUGETLB_TEST or SHMEM_TEST */
133 
134 #define EXPECTED_IOCTLS		UFFD_API_RANGE_IOCTLS_BASIC
135 
136 #ifdef HUGETLB_TEST
137 
138 /* HugeTLB memory */
release_pages(char * rel_area)139 static int release_pages(char *rel_area)
140 {
141 	int ret = 0;
142 
143 	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
144 				rel_area == huge_fd_off0 ? 0 :
145 				nr_pages * page_size,
146 				nr_pages * page_size)) {
147 		perror("fallocate");
148 		ret = 1;
149 	}
150 
151 	return ret;
152 }
153 
154 
allocate_area(void ** alloc_area)155 static void allocate_area(void **alloc_area)
156 {
157 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
158 				MAP_PRIVATE | MAP_HUGETLB, huge_fd,
159 				*alloc_area == area_src ? 0 :
160 				nr_pages * page_size);
161 	if (*alloc_area == MAP_FAILED) {
162 		fprintf(stderr, "mmap of hugetlbfs file failed\n");
163 		*alloc_area = NULL;
164 	}
165 
166 	if (*alloc_area == area_src)
167 		huge_fd_off0 = *alloc_area;
168 }
169 
170 #elif defined(SHMEM_TEST)
171 
172 /* Shared memory */
release_pages(char * rel_area)173 static int release_pages(char *rel_area)
174 {
175 	int ret = 0;
176 
177 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
178 		perror("madvise");
179 		ret = 1;
180 	}
181 
182 	return ret;
183 }
184 
allocate_area(void ** alloc_area)185 static void allocate_area(void **alloc_area)
186 {
187 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
188 			   MAP_ANONYMOUS | MAP_SHARED, -1, 0);
189 	if (*alloc_area == MAP_FAILED) {
190 		fprintf(stderr, "shared memory mmap failed\n");
191 		*alloc_area = NULL;
192 	}
193 }
194 
195 #else /* SHMEM_TEST */
196 #error "Undefined test type"
197 #endif /* HUGETLB_TEST */
198 
199 #endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
200 
/*
 * Byte-by-byte comparison of the first n bytes of str1 and str2.
 *
 * Returns 0 when all n bytes are equal (including n == 0), 1 at the
 * first differing byte.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	unsigned long pos = 0;

	while (pos < n) {
		if (str1[pos] != str2[pos])
			return 1;
		pos++;
	}
	return 0;
}
209 
locking_thread(void * arg)210 static void *locking_thread(void *arg)
211 {
212 	unsigned long cpu = (unsigned long) arg;
213 	struct random_data rand;
214 	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
215 	int32_t rand_nr;
216 	unsigned long long count;
217 	char randstate[64];
218 	unsigned int seed;
219 	time_t start;
220 
221 	if (bounces & BOUNCE_RANDOM) {
222 		seed = (unsigned int) time(NULL) - bounces;
223 		if (!(bounces & BOUNCE_RACINGFAULTS))
224 			seed += cpu;
225 		bzero(&rand, sizeof(rand));
226 		bzero(&randstate, sizeof(randstate));
227 		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
228 			fprintf(stderr, "srandom_r error\n"), exit(1);
229 	} else {
230 		page_nr = -bounces;
231 		if (!(bounces & BOUNCE_RACINGFAULTS))
232 			page_nr += cpu * nr_pages_per_cpu;
233 	}
234 
235 	while (!finished) {
236 		if (bounces & BOUNCE_RANDOM) {
237 			if (random_r(&rand, &rand_nr))
238 				fprintf(stderr, "random_r 1 error\n"), exit(1);
239 			page_nr = rand_nr;
240 			if (sizeof(page_nr) > sizeof(rand_nr)) {
241 				if (random_r(&rand, &rand_nr))
242 					fprintf(stderr, "random_r 2 error\n"), exit(1);
243 				page_nr |= (((unsigned long) rand_nr) << 16) <<
244 					   16;
245 			}
246 		} else
247 			page_nr += 1;
248 		page_nr %= nr_pages;
249 
250 		start = time(NULL);
251 		if (bounces & BOUNCE_VERIFY) {
252 			count = *area_count(area_dst, page_nr);
253 			if (!count)
254 				fprintf(stderr,
255 					"page_nr %lu wrong count %Lu %Lu\n",
256 					page_nr, count,
257 					count_verify[page_nr]), exit(1);
258 
259 
260 			/*
261 			 * We can't use bcmp (or memcmp) because that
262 			 * returns 0 erroneously if the memory is
263 			 * changing under it (even if the end of the
264 			 * page is never changing and always
265 			 * different).
266 			 */
267 #if 1
268 			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
269 				     page_size))
270 				fprintf(stderr,
271 					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
272 					page_nr, count,
273 					count_verify[page_nr]), exit(1);
274 #else
275 			unsigned long loops;
276 
277 			loops = 0;
278 			/* uncomment the below line to test with mutex */
279 			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
280 			while (!bcmp(area_dst + page_nr * page_size, zeropage,
281 				     page_size)) {
282 				loops += 1;
283 				if (loops > 10)
284 					break;
285 			}
286 			/* uncomment below line to test with mutex */
287 			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
288 			if (loops) {
289 				fprintf(stderr,
290 					"page_nr %lu all zero thread %lu %p %lu\n",
291 					page_nr, cpu, area_dst + page_nr * page_size,
292 					loops);
293 				if (loops > 10)
294 					exit(1);
295 			}
296 #endif
297 		}
298 
299 		pthread_mutex_lock(area_mutex(area_dst, page_nr));
300 		count = *area_count(area_dst, page_nr);
301 		if (count != count_verify[page_nr]) {
302 			fprintf(stderr,
303 				"page_nr %lu memory corruption %Lu %Lu\n",
304 				page_nr, count,
305 				count_verify[page_nr]), exit(1);
306 		}
307 		count++;
308 		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
309 		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
310 
311 		if (time(NULL) - start > 1)
312 			fprintf(stderr,
313 				"userfault too slow %ld "
314 				"possible false positive with overcommit\n",
315 				time(NULL) - start);
316 	}
317 
318 	return NULL;
319 }
320 
copy_page(int ufd,unsigned long offset)321 static int copy_page(int ufd, unsigned long offset)
322 {
323 	struct uffdio_copy uffdio_copy;
324 
325 	if (offset >= nr_pages * page_size)
326 		fprintf(stderr, "unexpected offset %lu\n",
327 			offset), exit(1);
328 	uffdio_copy.dst = (unsigned long) area_dst + offset;
329 	uffdio_copy.src = (unsigned long) area_src + offset;
330 	uffdio_copy.len = page_size;
331 	uffdio_copy.mode = 0;
332 	uffdio_copy.copy = 0;
333 	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
334 		/* real retval in ufdio_copy.copy */
335 		if (uffdio_copy.copy != -EEXIST)
336 			fprintf(stderr, "UFFDIO_COPY error %Ld\n",
337 				uffdio_copy.copy), exit(1);
338 	} else if (uffdio_copy.copy != page_size) {
339 		fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
340 			uffdio_copy.copy), exit(1);
341 	} else
342 		return 1;
343 	return 0;
344 }
345 
uffd_poll_thread(void * arg)346 static void *uffd_poll_thread(void *arg)
347 {
348 	unsigned long cpu = (unsigned long) arg;
349 	struct pollfd pollfd[2];
350 	struct uffd_msg msg;
351 	struct uffdio_register uffd_reg;
352 	int ret;
353 	unsigned long offset;
354 	char tmp_chr;
355 	unsigned long userfaults = 0;
356 
357 	pollfd[0].fd = uffd;
358 	pollfd[0].events = POLLIN;
359 	pollfd[1].fd = pipefd[cpu*2];
360 	pollfd[1].events = POLLIN;
361 
362 	for (;;) {
363 		ret = poll(pollfd, 2, -1);
364 		if (!ret)
365 			fprintf(stderr, "poll error %d\n", ret), exit(1);
366 		if (ret < 0)
367 			perror("poll"), exit(1);
368 		if (pollfd[1].revents & POLLIN) {
369 			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
370 				fprintf(stderr, "read pipefd error\n"),
371 					exit(1);
372 			break;
373 		}
374 		if (!(pollfd[0].revents & POLLIN))
375 			fprintf(stderr, "pollfd[0].revents %d\n",
376 				pollfd[0].revents), exit(1);
377 		ret = read(uffd, &msg, sizeof(msg));
378 		if (ret < 0) {
379 			if (errno == EAGAIN)
380 				continue;
381 			perror("nonblocking read error"), exit(1);
382 		}
383 		switch (msg.event) {
384 		default:
385 			fprintf(stderr, "unexpected msg event %u\n",
386 				msg.event), exit(1);
387 			break;
388 		case UFFD_EVENT_PAGEFAULT:
389 			if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
390 				fprintf(stderr, "unexpected write fault\n"), exit(1);
391 			offset = (char *)(unsigned long)msg.arg.pagefault.address -
392 				area_dst;
393 			offset &= ~(page_size-1);
394 			if (copy_page(uffd, offset))
395 				userfaults++;
396 			break;
397 		case UFFD_EVENT_FORK:
398 			uffd = msg.arg.fork.ufd;
399 			pollfd[0].fd = uffd;
400 			break;
401 		case UFFD_EVENT_REMOVE:
402 			uffd_reg.range.start = msg.arg.remove.start;
403 			uffd_reg.range.len = msg.arg.remove.end -
404 				msg.arg.remove.start;
405 			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
406 				fprintf(stderr, "remove failure\n"), exit(1);
407 			break;
408 		case UFFD_EVENT_REMAP:
409 			area_dst = (char *)(unsigned long)msg.arg.remap.to;
410 			break;
411 		}
412 	}
413 	return (void *)userfaults;
414 }
415 
416 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
417 
uffd_read_thread(void * arg)418 static void *uffd_read_thread(void *arg)
419 {
420 	unsigned long *this_cpu_userfaults;
421 	struct uffd_msg msg;
422 	unsigned long offset;
423 	int ret;
424 
425 	this_cpu_userfaults = (unsigned long *) arg;
426 	*this_cpu_userfaults = 0;
427 
428 	pthread_mutex_unlock(&uffd_read_mutex);
429 	/* from here cancellation is ok */
430 
431 	for (;;) {
432 		ret = read(uffd, &msg, sizeof(msg));
433 		if (ret != sizeof(msg)) {
434 			if (ret < 0)
435 				perror("blocking read error"), exit(1);
436 			else
437 				fprintf(stderr, "short read\n"), exit(1);
438 		}
439 		if (msg.event != UFFD_EVENT_PAGEFAULT)
440 			fprintf(stderr, "unexpected msg event %u\n",
441 				msg.event), exit(1);
442 		if (bounces & BOUNCE_VERIFY &&
443 		    msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
444 			fprintf(stderr, "unexpected write fault\n"), exit(1);
445 		offset = (char *)(unsigned long)msg.arg.pagefault.address -
446 			 area_dst;
447 		offset &= ~(page_size-1);
448 		if (copy_page(uffd, offset))
449 			(*this_cpu_userfaults)++;
450 	}
451 	return (void *)NULL;
452 }
453 
background_thread(void * arg)454 static void *background_thread(void *arg)
455 {
456 	unsigned long cpu = (unsigned long) arg;
457 	unsigned long page_nr;
458 
459 	for (page_nr = cpu * nr_pages_per_cpu;
460 	     page_nr < (cpu+1) * nr_pages_per_cpu;
461 	     page_nr++)
462 		copy_page(uffd, page_nr * page_size);
463 
464 	return NULL;
465 }
466 
stress(unsigned long * userfaults)467 static int stress(unsigned long *userfaults)
468 {
469 	unsigned long cpu;
470 	pthread_t locking_threads[nr_cpus];
471 	pthread_t uffd_threads[nr_cpus];
472 	pthread_t background_threads[nr_cpus];
473 	void **_userfaults = (void **) userfaults;
474 
475 	finished = 0;
476 	for (cpu = 0; cpu < nr_cpus; cpu++) {
477 		if (pthread_create(&locking_threads[cpu], &attr,
478 				   locking_thread, (void *)cpu))
479 			return 1;
480 		if (bounces & BOUNCE_POLL) {
481 			if (pthread_create(&uffd_threads[cpu], &attr,
482 					   uffd_poll_thread, (void *)cpu))
483 				return 1;
484 		} else {
485 			if (pthread_create(&uffd_threads[cpu], &attr,
486 					   uffd_read_thread,
487 					   &_userfaults[cpu]))
488 				return 1;
489 			pthread_mutex_lock(&uffd_read_mutex);
490 		}
491 		if (pthread_create(&background_threads[cpu], &attr,
492 				   background_thread, (void *)cpu))
493 			return 1;
494 	}
495 	for (cpu = 0; cpu < nr_cpus; cpu++)
496 		if (pthread_join(background_threads[cpu], NULL))
497 			return 1;
498 
499 	/*
500 	 * Be strict and immediately zap area_src, the whole area has
501 	 * been transferred already by the background treads. The
502 	 * area_src could then be faulted in in a racy way by still
503 	 * running uffdio_threads reading zeropages after we zapped
504 	 * area_src (but they're guaranteed to get -EEXIST from
505 	 * UFFDIO_COPY without writing zero pages into area_dst
506 	 * because the background threads already completed).
507 	 */
508 	if (release_pages(area_src))
509 		return 1;
510 
511 	for (cpu = 0; cpu < nr_cpus; cpu++) {
512 		char c;
513 		if (bounces & BOUNCE_POLL) {
514 			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
515 				fprintf(stderr, "pipefd write error\n");
516 				return 1;
517 			}
518 			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
519 				return 1;
520 		} else {
521 			if (pthread_cancel(uffd_threads[cpu]))
522 				return 1;
523 			if (pthread_join(uffd_threads[cpu], NULL))
524 				return 1;
525 		}
526 	}
527 
528 	finished = 1;
529 	for (cpu = 0; cpu < nr_cpus; cpu++)
530 		if (pthread_join(locking_threads[cpu], NULL))
531 			return 1;
532 
533 	return 0;
534 }
535 
userfaultfd_open(int features)536 static int userfaultfd_open(int features)
537 {
538 	struct uffdio_api uffdio_api;
539 
540 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
541 	if (uffd < 0) {
542 		fprintf(stderr,
543 			"userfaultfd syscall not available in this kernel\n");
544 		return 1;
545 	}
546 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
547 
548 	uffdio_api.api = UFFD_API;
549 	uffdio_api.features = features;
550 	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
551 		fprintf(stderr, "UFFDIO_API\n");
552 		return 1;
553 	}
554 	if (uffdio_api.api != UFFD_API) {
555 		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
556 		return 1;
557 	}
558 
559 	return 0;
560 }
561 
562 /*
563  * For non-cooperative userfaultfd test we fork() a process that will
564  * generate pagefaults, will mremap the area monitored by the
565  * userfaultfd and at last this process will release the monitored
566  * area.
567  * For the anonymous and shared memory the area is divided into two
568  * parts, the first part is accessed before mremap, and the second
569  * part is accessed after mremap. Since hugetlbfs does not support
570  * mremap, the entire monitored area is accessed in a single pass for
571  * HUGETLB_TEST.
572  * The release of the pages currently generates event for shmem and
573  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
574  * for hugetlb.
575  */
faulting_process(void)576 static int faulting_process(void)
577 {
578 	unsigned long nr;
579 	unsigned long long count;
580 
581 #ifndef HUGETLB_TEST
582 	unsigned long split_nr_pages = (nr_pages + 1) / 2;
583 #else
584 	unsigned long split_nr_pages = nr_pages;
585 #endif
586 
587 	for (nr = 0; nr < split_nr_pages; nr++) {
588 		count = *area_count(area_dst, nr);
589 		if (count != count_verify[nr]) {
590 			fprintf(stderr,
591 				"nr %lu memory corruption %Lu %Lu\n",
592 				nr, count,
593 				count_verify[nr]), exit(1);
594 		}
595 	}
596 
597 #ifndef HUGETLB_TEST
598 	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
599 			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
600 	if (area_dst == MAP_FAILED)
601 		perror("mremap"), exit(1);
602 
603 	for (; nr < nr_pages; nr++) {
604 		count = *area_count(area_dst, nr);
605 		if (count != count_verify[nr]) {
606 			fprintf(stderr,
607 				"nr %lu memory corruption %Lu %Lu\n",
608 				nr, count,
609 				count_verify[nr]), exit(1);
610 		}
611 	}
612 
613 	if (release_pages(area_dst))
614 		return 1;
615 
616 	for (nr = 0; nr < nr_pages; nr++) {
617 		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
618 			fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
619 	}
620 
621 #endif /* HUGETLB_TEST */
622 
623 	return 0;
624 }
625 
uffdio_zeropage(int ufd,unsigned long offset)626 static int uffdio_zeropage(int ufd, unsigned long offset)
627 {
628 	struct uffdio_zeropage uffdio_zeropage;
629 	int ret;
630 	unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);
631 
632 	if (offset >= nr_pages * page_size)
633 		fprintf(stderr, "unexpected offset %lu\n",
634 			offset), exit(1);
635 	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
636 	uffdio_zeropage.range.len = page_size;
637 	uffdio_zeropage.mode = 0;
638 	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
639 	if (ret) {
640 		/* real retval in ufdio_zeropage.zeropage */
641 		if (has_zeropage) {
642 			if (uffdio_zeropage.zeropage == -EEXIST)
643 				fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"),
644 					exit(1);
645 			else
646 				fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
647 					uffdio_zeropage.zeropage), exit(1);
648 		} else {
649 			if (uffdio_zeropage.zeropage != -EINVAL)
650 				fprintf(stderr,
651 					"UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
652 					uffdio_zeropage.zeropage), exit(1);
653 		}
654 	} else if (has_zeropage) {
655 		if (uffdio_zeropage.zeropage != page_size) {
656 			fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
657 				uffdio_zeropage.zeropage), exit(1);
658 		} else
659 			return 1;
660 	} else {
661 		fprintf(stderr,
662 			"UFFDIO_ZEROPAGE succeeded %Ld\n",
663 			uffdio_zeropage.zeropage), exit(1);
664 	}
665 
666 	return 0;
667 }
668 
669 /* exercise UFFDIO_ZEROPAGE */
userfaultfd_zeropage_test(void)670 static int userfaultfd_zeropage_test(void)
671 {
672 	struct uffdio_register uffdio_register;
673 	unsigned long expected_ioctls;
674 
675 	printf("testing UFFDIO_ZEROPAGE: ");
676 	fflush(stdout);
677 
678 	if (release_pages(area_dst))
679 		return 1;
680 
681 	if (userfaultfd_open(0) < 0)
682 		return 1;
683 	uffdio_register.range.start = (unsigned long) area_dst;
684 	uffdio_register.range.len = nr_pages * page_size;
685 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
686 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
687 		fprintf(stderr, "register failure\n"), exit(1);
688 
689 	expected_ioctls = EXPECTED_IOCTLS;
690 	if ((uffdio_register.ioctls & expected_ioctls) !=
691 	    expected_ioctls)
692 		fprintf(stderr,
693 			"unexpected missing ioctl for anon memory\n"),
694 			exit(1);
695 
696 	if (uffdio_zeropage(uffd, 0)) {
697 		if (my_bcmp(area_dst, zeropage, page_size))
698 			fprintf(stderr, "zeropage is not zero\n"), exit(1);
699 	}
700 
701 	close(uffd);
702 	printf("done.\n");
703 	return 0;
704 }
705 
userfaultfd_events_test(void)706 static int userfaultfd_events_test(void)
707 {
708 	struct uffdio_register uffdio_register;
709 	unsigned long expected_ioctls;
710 	unsigned long userfaults;
711 	pthread_t uffd_mon;
712 	int err, features;
713 	pid_t pid;
714 	char c;
715 
716 	printf("testing events (fork, remap, remove): ");
717 	fflush(stdout);
718 
719 	if (release_pages(area_dst))
720 		return 1;
721 
722 	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
723 		UFFD_FEATURE_EVENT_REMOVE;
724 	if (userfaultfd_open(features) < 0)
725 		return 1;
726 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
727 
728 	uffdio_register.range.start = (unsigned long) area_dst;
729 	uffdio_register.range.len = nr_pages * page_size;
730 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
731 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
732 		fprintf(stderr, "register failure\n"), exit(1);
733 
734 	expected_ioctls = EXPECTED_IOCTLS;
735 	if ((uffdio_register.ioctls & expected_ioctls) !=
736 	    expected_ioctls)
737 		fprintf(stderr,
738 			"unexpected missing ioctl for anon memory\n"),
739 			exit(1);
740 
741 	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
742 		perror("uffd_poll_thread create"), exit(1);
743 
744 	pid = fork();
745 	if (pid < 0)
746 		perror("fork"), exit(1);
747 
748 	if (!pid)
749 		return faulting_process();
750 
751 	waitpid(pid, &err, 0);
752 	if (err)
753 		fprintf(stderr, "faulting process failed\n"), exit(1);
754 
755 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
756 		perror("pipe write"), exit(1);
757 	if (pthread_join(uffd_mon, (void **)&userfaults))
758 		return 1;
759 
760 	close(uffd);
761 	printf("userfaults: %ld\n", userfaults);
762 
763 	return userfaults != nr_pages;
764 }
765 
userfaultfd_stress(void)766 static int userfaultfd_stress(void)
767 {
768 	void *area;
769 	char *tmp_area;
770 	unsigned long nr;
771 	struct uffdio_register uffdio_register;
772 	unsigned long cpu;
773 	int err;
774 	unsigned long userfaults[nr_cpus];
775 
776 	allocate_area((void **)&area_src);
777 	if (!area_src)
778 		return 1;
779 	allocate_area((void **)&area_dst);
780 	if (!area_dst)
781 		return 1;
782 
783 	if (userfaultfd_open(0) < 0)
784 		return 1;
785 
786 	count_verify = malloc(nr_pages * sizeof(unsigned long long));
787 	if (!count_verify) {
788 		perror("count_verify");
789 		return 1;
790 	}
791 
792 	for (nr = 0; nr < nr_pages; nr++) {
793 		*area_mutex(area_src, nr) = (pthread_mutex_t)
794 			PTHREAD_MUTEX_INITIALIZER;
795 		count_verify[nr] = *area_count(area_src, nr) = 1;
796 		/*
797 		 * In the transition between 255 to 256, powerpc will
798 		 * read out of order in my_bcmp and see both bytes as
799 		 * zero, so leave a placeholder below always non-zero
800 		 * after the count, to avoid my_bcmp to trigger false
801 		 * positives.
802 		 */
803 		*(area_count(area_src, nr) + 1) = 1;
804 	}
805 
806 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
807 	if (!pipefd) {
808 		perror("pipefd");
809 		return 1;
810 	}
811 	for (cpu = 0; cpu < nr_cpus; cpu++) {
812 		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
813 			perror("pipe");
814 			return 1;
815 		}
816 	}
817 
818 	if (posix_memalign(&area, page_size, page_size)) {
819 		fprintf(stderr, "out of memory\n");
820 		return 1;
821 	}
822 	zeropage = area;
823 	bzero(zeropage, page_size);
824 
825 	pthread_mutex_lock(&uffd_read_mutex);
826 
827 	pthread_attr_init(&attr);
828 	pthread_attr_setstacksize(&attr, 16*1024*1024);
829 
830 	err = 0;
831 	while (bounces--) {
832 		unsigned long expected_ioctls;
833 
834 		printf("bounces: %d, mode:", bounces);
835 		if (bounces & BOUNCE_RANDOM)
836 			printf(" rnd");
837 		if (bounces & BOUNCE_RACINGFAULTS)
838 			printf(" racing");
839 		if (bounces & BOUNCE_VERIFY)
840 			printf(" ver");
841 		if (bounces & BOUNCE_POLL)
842 			printf(" poll");
843 		printf(", ");
844 		fflush(stdout);
845 
846 		if (bounces & BOUNCE_POLL)
847 			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
848 		else
849 			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
850 
851 		/* register */
852 		uffdio_register.range.start = (unsigned long) area_dst;
853 		uffdio_register.range.len = nr_pages * page_size;
854 		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
855 		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
856 			fprintf(stderr, "register failure\n");
857 			return 1;
858 		}
859 		expected_ioctls = EXPECTED_IOCTLS;
860 		if ((uffdio_register.ioctls & expected_ioctls) !=
861 		    expected_ioctls) {
862 			fprintf(stderr,
863 				"unexpected missing ioctl for anon memory\n");
864 			return 1;
865 		}
866 
867 		/*
868 		 * The madvise done previously isn't enough: some
869 		 * uffd_thread could have read userfaults (one of
870 		 * those already resolved by the background thread)
871 		 * and it may be in the process of calling
872 		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
873 		 * area_src and it would map a zero page in it (of
874 		 * course such a UFFDIO_COPY is perfectly safe as it'd
875 		 * return -EEXIST). The problem comes at the next
876 		 * bounce though: that racing UFFDIO_COPY would
877 		 * generate zeropages in the area_src, so invalidating
878 		 * the previous MADV_DONTNEED. Without this additional
879 		 * MADV_DONTNEED those zeropages leftovers in the
880 		 * area_src would lead to -EEXIST failure during the
881 		 * next bounce, effectively leaving a zeropage in the
882 		 * area_dst.
883 		 *
884 		 * Try to comment this out madvise to see the memory
885 		 * corruption being caught pretty quick.
886 		 *
887 		 * khugepaged is also inhibited to collapse THP after
888 		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
889 		 * required to MADV_DONTNEED here.
890 		 */
891 		if (release_pages(area_dst))
892 			return 1;
893 
894 		/* bounce pass */
895 		if (stress(userfaults))
896 			return 1;
897 
898 		/* unregister */
899 		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
900 			fprintf(stderr, "register failure\n");
901 			return 1;
902 		}
903 
904 		/* verification */
905 		if (bounces & BOUNCE_VERIFY) {
906 			for (nr = 0; nr < nr_pages; nr++) {
907 				if (*area_count(area_dst, nr) != count_verify[nr]) {
908 					fprintf(stderr,
909 						"error area_count %Lu %Lu %lu\n",
910 						*area_count(area_src, nr),
911 						count_verify[nr],
912 						nr);
913 					err = 1;
914 					bounces = 0;
915 				}
916 			}
917 		}
918 
919 		/* prepare next bounce */
920 		tmp_area = area_src;
921 		area_src = area_dst;
922 		area_dst = tmp_area;
923 
924 		printf("userfaults:");
925 		for (cpu = 0; cpu < nr_cpus; cpu++)
926 			printf(" %lu", userfaults[cpu]);
927 		printf("\n");
928 	}
929 
930 	if (err)
931 		return err;
932 
933 	close(uffd);
934 	return userfaultfd_zeropage_test() || userfaultfd_events_test();
935 }
936 
937 #ifndef HUGETLB_TEST
938 
main(int argc,char ** argv)939 int main(int argc, char **argv)
940 {
941 	if (argc < 3)
942 		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
943 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
944 	page_size = sysconf(_SC_PAGE_SIZE);
945 	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
946 	    > page_size)
947 		fprintf(stderr, "Impossible to run this test\n"), exit(2);
948 	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
949 		nr_cpus;
950 	if (!nr_pages_per_cpu) {
951 		fprintf(stderr, "invalid MiB\n");
952 		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
953 	}
954 	bounces = atoi(argv[2]);
955 	if (bounces <= 0) {
956 		fprintf(stderr, "invalid bounces\n");
957 		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
958 	}
959 	nr_pages = nr_pages_per_cpu * nr_cpus;
960 	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
961 	       nr_pages, nr_pages_per_cpu);
962 	return userfaultfd_stress();
963 }
964 
965 #else /* HUGETLB_TEST */
966 
/*
 * Copied from mlock2-tests.c
 *
 * Parse the "Hugepagesize:" line of /proc/meminfo.
 *
 * Returns the value found there converted from kB to bytes, or 0 when
 * the file cannot be opened or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo = fopen("/proc/meminfo", "r");
	char *line = NULL;
	size_t cap = 0;
	unsigned long hps = 0;

	if (!meminfo)
		return 0;

	for (;;) {
		if (getline(&line, &cap, meminfo) <= 0)
			break;
		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) != 1)
			continue;
		/* kB -> bytes */
		hps <<= 10;
		break;
	}

	free(line);
	fclose(meminfo);
	return hps;
}
990 
main(int argc,char ** argv)991 int main(int argc, char **argv)
992 {
993 	if (argc < 4)
994 		fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
995 				exit(1);
996 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
997 	page_size = default_huge_page_size();
998 	if (!page_size)
999 		fprintf(stderr, "Unable to determine huge page size\n"),
1000 				exit(2);
1001 	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1002 	    > page_size)
1003 		fprintf(stderr, "Impossible to run this test\n"), exit(2);
1004 	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
1005 		nr_cpus;
1006 	if (!nr_pages_per_cpu) {
1007 		fprintf(stderr, "invalid MiB\n");
1008 		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
1009 	}
1010 	bounces = atoi(argv[2]);
1011 	if (bounces <= 0) {
1012 		fprintf(stderr, "invalid bounces\n");
1013 		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
1014 	}
1015 	nr_pages = nr_pages_per_cpu * nr_cpus;
1016 	huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
1017 	if (huge_fd < 0) {
1018 		fprintf(stderr, "Open of %s failed", argv[3]);
1019 		perror("open");
1020 		exit(1);
1021 	}
1022 	if (ftruncate(huge_fd, 0)) {
1023 		fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1024 		perror("ftruncate");
1025 		exit(1);
1026 	}
1027 	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1028 	       nr_pages, nr_pages_per_cpu);
1029 	return userfaultfd_stress();
1030 }
1031 
1032 #endif
1033 #else /* __NR_userfaultfd */
1034 
1035 #warning "missing __NR_userfaultfd definition"
1036 
/*
 * Fallback entry point used when __NR_userfaultfd is not defined:
 * print a skip notice and report success.
 */
int main(void)
{
	fputs("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n",
	      stdout);
	return 0;
}
1042 
1043 #endif /* __NR_userfaultfd */
1044