/* SPDX-License-Identifier: MIT */
#define _DEFAULT_SOURCE

#include "lib.h"
#include "syscall.h"
#include "liburing.h"
#include "int_flags.h"
#include "setup.h"
#include "liburing/compat.h"
#include "liburing/io_uring.h"

#define KERN_MAX_ENTRIES	32768
#define KERN_MAX_CQ_ENTRIES	(2 * KERN_MAX_ENTRIES)

static inline int __fls(int x)
{
	if (!x)
		return 0;
	return 8 * sizeof(x) - __builtin_clz(x);
}

static unsigned roundup_pow2(unsigned depth)
{
	return 1U << __fls(depth - 1);
}

static int get_sq_cq_entries(unsigned entries, struct io_uring_params *p,
			     unsigned *sq, unsigned *cq)
{
	unsigned cq_entries;

	if (!entries)
		return -EINVAL;
	if (entries > KERN_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = KERN_MAX_ENTRIES;
	}

	entries = roundup_pow2(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		if (!p->cq_entries)
			return -EINVAL;
		cq_entries = p->cq_entries;
		if (cq_entries > KERN_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			cq_entries = KERN_MAX_CQ_ENTRIES;
		}
		cq_entries = roundup_pow2(cq_entries);
		if (cq_entries < entries)
			return -EINVAL;
	} else {
		cq_entries = 2 * entries;
	}

	*sq = entries;
	*cq = cq_entries;
	return 0;
}

static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	if (sq->ring_sz)
		__sys_munmap(sq->ring_ptr, sq->ring_sz);
	if (cq->ring_ptr && cq->ring_sz && cq->ring_ptr != sq->ring_ptr)
		__sys_munmap(cq->ring_ptr, cq->ring_sz);
}

static void io_uring_setup_ring_pointers(struct io_uring_params *p,
					 struct io_uring_sq *sq,
					 struct io_uring_cq *cq)
{
	sq->khead = sq->ring_ptr + p->sq_off.head;
	sq->ktail = sq->ring_ptr + p->sq_off.tail;
	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
	sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
	sq->kflags = sq->ring_ptr + p->sq_off.flags;
	sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
	if (!(p->flags & IORING_SETUP_NO_SQARRAY))
		sq->array = sq->ring_ptr + p->sq_off.array;

	cq->khead = cq->ring_ptr + p->cq_off.head;
	cq->ktail = cq->ring_ptr + p->cq_off.tail;
	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
	cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
	cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
	cq->cqes = cq->ring_ptr + p->cq_off.cqes;
	if (p->cq_off.flags)
		cq->kflags = cq->ring_ptr + p->cq_off.flags;

	sq->ring_mask = *sq->kring_mask;
	sq->ring_entries = *sq->kring_entries;
	cq->ring_mask = *cq->kring_mask;
	cq->ring_entries = *cq->kring_entries;
}

static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	int ret;

	size = sizeof(struct io_uring_cqe);
	if (p->flags & IORING_SETUP_CQE32)
		size += sizeof(struct io_uring_cqe);

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		if (cq->ring_sz > sq->ring_sz)
			sq->ring_sz = cq->ring_sz;
		cq->ring_sz = sq->ring_sz;
	}
	sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
				  MAP_SHARED | MAP_POPULATE, fd,
				  IORING_OFF_SQ_RING);
	if (IS_ERR(sq->ring_ptr))
		return PTR_ERR(sq->ring_ptr);

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		cq->ring_ptr = sq->ring_ptr;
	} else {
		cq->ring_ptr = __sys_mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
					  MAP_SHARED | MAP_POPULATE, fd,
					  IORING_OFF_CQ_RING);
		if (IS_ERR(cq->ring_ptr)) {
			ret = PTR_ERR(cq->ring_ptr);
			cq->ring_ptr = NULL;
			goto err;
		}
	}

	size = sizeof(struct io_uring_sqe);
	if (p->flags & IORING_SETUP_SQE128)
		size += 64;
	sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,
			      MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (IS_ERR(sq->sqes)) {
		ret = PTR_ERR(sq->sqes);
err:
		io_uring_unmap_rings(sq, cq);
		return ret;
	}

	io_uring_setup_ring_pointers(p, sq, cq);
	return 0;
}

/*
 * For users that want to specify sq_thread_cpu or sq_thread_idle, this
 * interface is a convenient helper for mmap()ing the rings.
 * Returns -errno on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
__cold int io_uring_queue_mmap(int fd, struct io_uring_params *p,
			       struct io_uring *ring)
{
	memset(ring, 0, sizeof(*ring));
	return io_uring_mmap(fd, p, &ring->sq, &ring->cq);
}
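
/*
 * Example usage (a minimal sketch, not part of the library): an application
 * that creates the ring fd itself via the io_uring_setup(2) syscall can use
 * io_uring_queue_mmap() to map the rings afterwards. Most applications
 * should simply call io_uring_queue_init() instead; the raw syscall below
 * assumes a libc that exposes __NR_io_uring_setup via <sys/syscall.h>.
 *
 *	#include <errno.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include "liburing.h"
 *
 *	static int setup_and_mmap(unsigned entries, struct io_uring *ring)
 *	{
 *		struct io_uring_params p = { };
 *		int fd;
 *
 *		fd = (int) syscall(__NR_io_uring_setup, entries, &p);
 *		if (fd < 0)
 *			return -errno;
 *		return io_uring_queue_mmap(fd, &p, ring);
 *	}
 */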

/*
 * Ensure that the mmap'ed rings aren't available to a child after a fork(2).
 * This uses madvise(..., MADV_DONTFORK) on the mmap'ed ranges.
 */
__cold int io_uring_ring_dontfork(struct io_uring *ring)
{
	size_t len;
	int ret;

	if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
		return -EINVAL;

	len = sizeof(struct io_uring_sqe);
	if (ring->flags & IORING_SETUP_SQE128)
		len += 64;
	len *= ring->sq.ring_entries;
	ret = __sys_madvise(ring->sq.sqes, len, MADV_DONTFORK);
	if (ret < 0)
		return ret;

	len = ring->sq.ring_sz;
	ret = __sys_madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
	if (ret < 0)
		return ret;

	if (ring->cq.ring_ptr != ring->sq.ring_ptr) {
		len = ring->cq.ring_sz;
		ret = __sys_madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
		if (ret < 0)
			return ret;
	}

	return 0;
}
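
/*
 * Example usage (sketch): a process that fork()s after ring setup can mark
 * the ring memory as not inherited, so children never see the parent's
 * SQ/CQ mappings. The entry count and flags below are arbitrary.
 *
 *	struct io_uring ring;
 *	int ret;
 *
 *	ret = io_uring_queue_init(64, &ring, 0);
 *	if (!ret)
 *		ret = io_uring_ring_dontfork(&ring);
 */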

/* FIXME */
static size_t huge_page_size = 2 * 1024 * 1024;

#define KRING_SIZE	64

/*
 * Returns negative for error, or number of bytes used in the buffer on success
 */
static int io_uring_alloc_huge(unsigned entries, struct io_uring_params *p,
			       struct io_uring_sq *sq, struct io_uring_cq *cq,
			       void *buf, size_t buf_size)
{
	unsigned long page_size = get_page_size();
	unsigned sq_entries, cq_entries;
	size_t ring_mem, sqes_mem, cqes_mem;
	unsigned long mem_used = 0;
	void *ptr;
	int ret;

	ret = get_sq_cq_entries(entries, p, &sq_entries, &cq_entries);
	if (ret)
		return ret;

	ring_mem = KRING_SIZE;

	sqes_mem = sq_entries * sizeof(struct io_uring_sqe);
	if (!(p->flags & IORING_SETUP_NO_SQARRAY))
		sqes_mem += sq_entries * sizeof(unsigned);
	sqes_mem = (sqes_mem + page_size - 1) & ~(page_size - 1);

	cqes_mem = cq_entries * sizeof(struct io_uring_cqe);
	if (p->flags & IORING_SETUP_CQE32)
		cqes_mem *= 2;
	ring_mem += sqes_mem + cqes_mem;
	mem_used = ring_mem;
	mem_used = (mem_used + page_size - 1) & ~(page_size - 1);

	/*
	 * A maxed-out number of CQ entries with IORING_SETUP_CQE32 fills a 2MB
	 * huge page by itself, so the SQ entries won't fit in the same huge
	 * page. For SQEs, that shouldn't be possible given KERN_MAX_ENTRIES,
	 * but check that too to future-proof (e.g. against different huge page
	 * sizes). Bail out early so we don't overrun.
	 */
	if (!buf && (sqes_mem > huge_page_size || ring_mem > huge_page_size))
		return -ENOMEM;

	if (buf) {
		if (mem_used > buf_size)
			return -ENOMEM;
		ptr = buf;
	} else {
		int map_hugetlb = 0;
		if (sqes_mem <= page_size)
			buf_size = page_size;
		else {
			buf_size = huge_page_size;
			map_hugetlb = MAP_HUGETLB;
		}
		ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
				 MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
				 -1, 0);
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);
	}

	sq->sqes = ptr;
	if (mem_used <= buf_size) {
		sq->ring_ptr = (void *) sq->sqes + sqes_mem;
		/* clear ring sizes, we have just one mmap() to undo */
		cq->ring_sz = 0;
		sq->ring_sz = 0;
	} else {
		int map_hugetlb = 0;
		if (ring_mem <= page_size)
			buf_size = page_size;
		else {
			buf_size = huge_page_size;
			map_hugetlb = MAP_HUGETLB;
		}
		ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
				 MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
				 -1, 0);
		if (IS_ERR(ptr)) {
			__sys_munmap(sq->sqes, 1);
			return PTR_ERR(ptr);
		}
		sq->ring_ptr = ptr;
		sq->ring_sz = buf_size;
		cq->ring_sz = 0;
	}

	cq->ring_ptr = (void *) sq->ring_ptr;
	p->sq_off.user_addr = (unsigned long) sq->sqes;
	p->cq_off.user_addr = (unsigned long) sq->ring_ptr;
	return (int) mem_used;
}

int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
				 struct io_uring_params *p, void *buf,
				 size_t buf_size)
{
	int fd, ret = 0;
	unsigned *sq_array;
	unsigned sq_entries, index;

	memset(ring, 0, sizeof(*ring));

	/*
	 * The kernel does this check already, but checking it here allows us
	 * to avoid handling it below.
	 */
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY
	    && !(p->flags & IORING_SETUP_NO_MMAP))
		return -EINVAL;

	if (p->flags & IORING_SETUP_NO_MMAP) {
		ret = io_uring_alloc_huge(entries, p, &ring->sq, &ring->cq,
					  buf, buf_size);
		if (ret < 0)
			return ret;
		if (buf)
			ring->int_flags |= INT_FLAG_APP_MEM;
	}

	fd = __sys_io_uring_setup(entries, p);
	if (fd < 0) {
		if ((p->flags & IORING_SETUP_NO_MMAP) &&
		    !(ring->int_flags & INT_FLAG_APP_MEM)) {
			__sys_munmap(ring->sq.sqes, 1);
			io_uring_unmap_rings(&ring->sq, &ring->cq);
		}
		return fd;
	}

	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
		ret = io_uring_queue_mmap(fd, p, ring);
		if (ret) {
			__sys_close(fd);
			return ret;
		}
	} else {
		io_uring_setup_ring_pointers(p, &ring->sq, &ring->cq);
	}

	/*
	 * Directly map SQ slots to SQEs
	 */
	sq_entries = ring->sq.ring_entries;

	if (!(p->flags & IORING_SETUP_NO_SQARRAY)) {
		sq_array = ring->sq.array;
		for (index = 0; index < sq_entries; index++)
			sq_array[index] = index;
	}
	ring->features = p->features;
	ring->flags = p->flags;
	ring->enter_ring_fd = fd;
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) {
		ring->ring_fd = -1;
		ring->int_flags |= INT_FLAG_REG_RING | INT_FLAG_REG_REG_RING;
	} else {
		ring->ring_fd = fd;
	}

	return ret;
}

static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,
					   struct io_uring_params *p, void *buf,
					   size_t buf_size)
{
	unsigned flags = p->flags;
	int ret;

	p->flags |= IORING_SETUP_NO_SQARRAY;
	ret = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);

	/* don't fall back if NOSQARRAY was explicitly asked for */
	if (ret != -EINVAL || (flags & IORING_SETUP_NO_SQARRAY))
		return ret;

	p->flags = flags;
	return __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
}

/*
 * Like io_uring_queue_init_params(), except it allows the application to pass
 * in a pre-allocated memory range that is used for the shared data between
 * the kernel and the application. This includes the sqes array and the two
 * rings. The memory must be contiguous; the expected use case is that the
 * app allocates a huge page and passes it in.
 *
 * Returns the number of bytes used in the buffer; the app can then reuse
 * the buffer with the returned offset to put more rings in the same huge
 * page. Returns -ENOMEM if there's not enough room left in the buffer to
 * host the ring.
 */
int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring,
			    struct io_uring_params *p,
			    void *buf, size_t buf_size)
{
	/* should already be set... */
	p->flags |= IORING_SETUP_NO_MMAP;
	return io_uring_queue_init_try_nosqarr(entries, ring, p, buf, buf_size);
}
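
/*
 * Example usage (sketch): back two rings with one 2MB huge page allocated by
 * the application. The huge page size, entry counts, and error handling are
 * illustrative assumptions; MAP_HUGETLB requires hugetlb pages to be
 * available on the system.
 *
 *	#include <errno.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include "liburing.h"
 *
 *	#define HUGE_SZ	(2 * 1024 * 1024)
 *
 *	static int init_two_rings(struct io_uring *a, struct io_uring *b)
 *	{
 *		struct io_uring_params p = { .flags = IORING_SETUP_NO_MMAP };
 *		void *buf;
 *		int used;
 *
 *		buf = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *		if (buf == MAP_FAILED)
 *			return -ENOMEM;
 *
 *		used = io_uring_queue_init_mem(8, a, &p, buf, HUGE_SZ);
 *		if (used < 0)
 *			return used;
 *
 *		// reuse the remainder of the same huge page for a second ring
 *		memset(&p, 0, sizeof(p));
 *		p.flags = IORING_SETUP_NO_MMAP;
 *		used = io_uring_queue_init_mem(8, b, &p, (char *) buf + used,
 *					       HUGE_SZ - used);
 *		return used < 0 ? used : 0;
 *	}
 */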

int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
			       struct io_uring_params *p)
{
	int ret;

	ret = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0);
	return ret >= 0 ? 0 : ret;
}

/*
 * Returns -errno on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
__cold int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			       unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags;

	return io_uring_queue_init_params(entries, ring, &p);
}
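
/*
 * Example usage (sketch): the common setup/teardown pattern. Entry count and
 * flags are arbitrary; a real application would submit and reap requests in
 * between.
 *
 *	struct io_uring ring;
 *	int ret;
 *
 *	ret = io_uring_queue_init(8, &ring, 0);
 *	if (ret < 0)
 *		return ret;
 *	// ... io_uring_get_sqe() / io_uring_submit() / io_uring_wait_cqe() ...
 *	io_uring_queue_exit(&ring);
 */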

__cold void io_uring_queue_exit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	struct io_uring_cq *cq = &ring->cq;
	size_t sqe_size;

	if (!sq->ring_sz && !(ring->int_flags & INT_FLAG_APP_MEM)) {
		sqe_size = sizeof(struct io_uring_sqe);
		if (ring->flags & IORING_SETUP_SQE128)
			sqe_size += 64;
		__sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
		io_uring_unmap_rings(sq, cq);
	} else {
		if (!(ring->int_flags & INT_FLAG_APP_MEM)) {
			__sys_munmap(sq->sqes,
				     *sq->kring_entries * sizeof(struct io_uring_sqe));
			io_uring_unmap_rings(sq, cq);
		}
	}

	/*
	 * Not strictly required, but frees up the slot we used now rather
	 * than at process exit time.
	 */
	if (ring->int_flags & INT_FLAG_REG_RING)
		io_uring_unregister_ring_fd(ring);
	if (ring->ring_fd != -1)
		__sys_close(ring->ring_fd);
}

__cold struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
{
	struct io_uring_probe *probe;
	size_t len;
	int r;

	len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
	probe = malloc(len);
	if (!probe)
		return NULL;
	memset(probe, 0, len);

	r = io_uring_register_probe(ring, probe, 256);
	if (r >= 0)
		return probe;

	free(probe);
	return NULL;
}

__cold struct io_uring_probe *io_uring_get_probe(void)
{
	struct io_uring ring;
	struct io_uring_probe *probe;
	int r;

	r = io_uring_queue_init(2, &ring, 0);
	if (r < 0)
		return NULL;

	probe = io_uring_get_probe_ring(&ring);
	io_uring_queue_exit(&ring);
	return probe;
}

__cold void io_uring_free_probe(struct io_uring_probe *probe)
{
	free(probe);
}

static size_t npages(size_t size, long page_size)
{
	size--;
	size /= page_size;
	return __fls((int) size);
}

static size_t rings_size(struct io_uring_params *p, unsigned entries,
			 unsigned cq_entries, long page_size)
{
	size_t pages, sq_size, cq_size;

	cq_size = sizeof(struct io_uring_cqe);
	if (p->flags & IORING_SETUP_CQE32)
		cq_size += sizeof(struct io_uring_cqe);
	cq_size *= cq_entries;
	cq_size += KRING_SIZE;
	cq_size = (cq_size + 63) & ~63UL;
	pages = (size_t) 1 << npages(cq_size, page_size);

	sq_size = sizeof(struct io_uring_sqe);
	if (p->flags & IORING_SETUP_SQE128)
		sq_size += 64;
	sq_size *= entries;
	pages += (size_t) 1 << npages(sq_size, page_size);
	return pages * page_size;
}

/*
 * Return the ulimit -l memlock memory required for a given ring setup, in
 * bytes. May return -errno on error. On newer (5.12+) kernels, io_uring no
 * longer requires any memlock memory, and hence this function will return 0
 * for that case. On older (5.11 and prior) kernels, this will return the
 * required memory so that the caller can ensure that enough space is
 * available before setting up a ring with the specified parameters.
 */
__cold ssize_t io_uring_mlock_size_params(unsigned entries,
					  struct io_uring_params *p)
{
	struct io_uring_params lp;
	struct io_uring ring;
	unsigned cq_entries, sq;
	long page_size;
	ssize_t ret;
	int cret;

	memset(&lp, 0, sizeof(lp));

	/*
	 * We only really use this inited ring to see if the kernel is newer
	 * or not. Newer kernels don't require memlocked memory. If we fail,
	 * it's most likely because it's an older kernel and we have no
	 * available memlock space. Just continue on; lp.features will still
	 * be zeroed at this point and we'll do the right thing.
	 */
	ret = io_uring_queue_init_params(entries, &ring, &lp);
	if (!ret)
		io_uring_queue_exit(&ring);

	/*
	 * Native workers imply using cgroup memory accounting, and hence no
	 * memlock memory is needed for the ring allocations.
	 */
	if (lp.features & IORING_FEAT_NATIVE_WORKERS)
		return 0;

	if (!entries)
		return -EINVAL;
	if (entries > KERN_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = KERN_MAX_ENTRIES;
	}

	cret = get_sq_cq_entries(entries, p, &sq, &cq_entries);
	if (cret)
		return cret;

	page_size = get_page_size();
	return rings_size(p, sq, cq_entries, page_size);
}
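
/*
 * Example usage (sketch, an illustrative fragment rather than a complete
 * function): on an older kernel, compare the computed requirement against
 * the current RLIMIT_MEMLOCK before setting up the ring.
 *
 *	#include <errno.h>
 *	#include <sys/resource.h>
 *
 *	struct io_uring_params p = { };
 *	struct rlimit rlim;
 *	ssize_t need;
 *
 *	need = io_uring_mlock_size_params(128, &p);
 *	if (need > 0) {
 *		if (getrlimit(RLIMIT_MEMLOCK, &rlim))
 *			return -errno;
 *		if ((size_t) need > rlim.rlim_cur)
 *			return -EPERM;
 *	}
 */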

/*
 * Return required ulimit -l memory space for a given ring setup. See
 * @io_uring_mlock_size_params().
 */
__cold ssize_t io_uring_mlock_size(unsigned entries, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags;
	return io_uring_mlock_size_params(entries, &p);
}

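/*
 * Two variants of br_setup() follow. On hppa (PA-RISC) the buffer ring is
 * allocated by the kernel and mmap()ed by the application via
 * IOU_PBUF_RING_MMAP; everywhere else liburing allocates the memory itself
 * and registers its address. (The rationale for the split is presumed to be
 * hppa's cache aliasing constraints on shared mappings; that reasoning is an
 * assumption, not stated in this file.)
 */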
#if defined(__hppa__)
static struct io_uring_buf_ring *br_setup(struct io_uring *ring,
					  unsigned int nentries, int bgid,
					  unsigned int flags, int *err)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
	size_t ring_size;
	off_t off;
	int lret;

	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = nentries;
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;

	*err = 0;
	lret = io_uring_register_buf_ring(ring, &reg, flags);
	if (lret) {
		*err = lret;
		return NULL;
	}

	off = IORING_OFF_PBUF_RING | (unsigned long long) bgid << IORING_OFF_PBUF_SHIFT;
	ring_size = nentries * sizeof(struct io_uring_buf);
	br = __sys_mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring->ring_fd, off);
	if (IS_ERR(br)) {
		*err = PTR_ERR(br);
		return NULL;
	}

	return br;
}
#else
static struct io_uring_buf_ring *br_setup(struct io_uring *ring,
					  unsigned int nentries, int bgid,
					  unsigned int flags, int *err)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
	size_t ring_size;
	int lret;

	memset(&reg, 0, sizeof(reg));
	ring_size = nentries * sizeof(struct io_uring_buf);
	br = __sys_mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (IS_ERR(br)) {
		*err = PTR_ERR(br);
		return NULL;
	}

	reg.ring_addr = (unsigned long) (uintptr_t) br;
	reg.ring_entries = nentries;
	reg.bgid = bgid;

	*err = 0;
	lret = io_uring_register_buf_ring(ring, &reg, flags);
	if (lret) {
		__sys_munmap(br, ring_size);
		*err = lret;
		br = NULL;
	}

	return br;
}
#endif

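/*
 * Register and map a provided-buffer ring of 'nentries' entries for buffer
 * group 'bgid', returning the initialized ring on success or NULL on failure
 * with *err set to -errno.
 *
 * Example usage (sketch, with hypothetical sizes):
 *
 *	int err;
 *	struct io_uring_buf_ring *br;
 *
 *	br = io_uring_setup_buf_ring(&ring, 32, 0, 0, &err);
 *	if (!br)
 *		return err;
 *	// fill it with io_uring_buf_ring_add() + io_uring_buf_ring_advance(),
 *	// then tear it down with:
 *	io_uring_free_buf_ring(&ring, br, 32, 0);
 */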
struct io_uring_buf_ring *io_uring_setup_buf_ring(struct io_uring *ring,
						  unsigned int nentries,
						  int bgid, unsigned int flags,
						  int *err)
{
	struct io_uring_buf_ring *br;

	br = br_setup(ring, nentries, bgid, flags, err);
	if (br)
		io_uring_buf_ring_init(br);

	return br;
}

int io_uring_free_buf_ring(struct io_uring *ring, struct io_uring_buf_ring *br,
			   unsigned int nentries, int bgid)
{
	int ret;

	ret = io_uring_unregister_buf_ring(ring, bgid);
	if (ret)
		return ret;

	__sys_munmap(br, nentries * sizeof(struct io_uring_buf));
	return 0;
}