/* SPDX-License-Identifier: MIT */
#define _DEFAULT_SOURCE

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>

#include "liburing/compat.h"
#include "liburing/io_uring.h"
#include "liburing.h"

#include "syscall.h"

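/*
 * Unmap the SQ ring, and the CQ ring too if it was mapped separately
 * (i.e. the kernel did not advertise IORING_FEAT_SINGLE_MMAP).
 */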
static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	munmap(sq->ring_ptr, sq->ring_sz);
	if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
		munmap(cq->ring_ptr, cq->ring_sz);
}

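/*
 * Map the SQ and CQ rings plus the SQE array into the process, using the
 * offsets the kernel filled into 'p'. With IORING_FEAT_SINGLE_MMAP the two
 * rings share a single mapping. Returns 0 on success, -errno on failure.
 */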
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	int ret;

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		if (cq->ring_sz > sq->ring_sz)
			sq->ring_sz = cq->ring_sz;
		cq->ring_sz = sq->ring_sz;
	}
	sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (sq->ring_ptr == MAP_FAILED)
		return -errno;

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		cq->ring_ptr = sq->ring_ptr;
	} else {
		cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
				    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
		if (cq->ring_ptr == MAP_FAILED) {
			cq->ring_ptr = NULL;
			ret = -errno;
			goto err;
		}
	}

	sq->khead = sq->ring_ptr + p->sq_off.head;
	sq->ktail = sq->ring_ptr + p->sq_off.tail;
	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
	sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
	sq->kflags = sq->ring_ptr + p->sq_off.flags;
	sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
	sq->array = sq->ring_ptr + p->sq_off.array;

	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd,
			IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		io_uring_unmap_rings(sq, cq);
		return ret;
	}

	cq->khead = cq->ring_ptr + p->cq_off.head;
	cq->ktail = cq->ring_ptr + p->cq_off.tail;
	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
	cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
	cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
	cq->cqes = cq->ring_ptr + p->cq_off.cqes;
	if (p->cq_off.flags)
		cq->kflags = cq->ring_ptr + p->cq_off.flags;
	return 0;
}

/*
 * For users that want to specify sq_thread_cpu or sq_thread_idle, this
 * interface is a convenient helper for mmap()ing the rings.
 * Returns -errno on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
{
	int ret;

	memset(ring, 0, sizeof(*ring));
	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
	if (!ret) {
		ring->flags = p->flags;
		ring->ring_fd = fd;
	}
	return ret;
}
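
/*
 * Illustrative sketch (not part of this file): a ring fd obtained directly
 * from the io_uring_setup(2) syscall can be turned into a usable ring with
 * io_uring_queue_mmap(). The SQPOLL parameters are only an example.
 *
 *	struct io_uring_params p = {
 *		.flags = IORING_SETUP_SQPOLL,
 *		.sq_thread_idle = 2000,
 *	};
 *	struct io_uring ring;
 *	int fd = (int) syscall(__NR_io_uring_setup, 8, &p);
 *
 *	if (fd >= 0 && !io_uring_queue_mmap(fd, &p, &ring)) {
 *		...use the ring...
 *		io_uring_queue_exit(&ring);
 *	}
 */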

/*
 * Ensure that the mmap'ed rings aren't available to a child after a fork(2).
 * This uses madvise(..., MADV_DONTFORK) on the mmap'ed ranges.
 */
int io_uring_ring_dontfork(struct io_uring *ring)
{
	size_t len;
	int ret;

	if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
		return -EINVAL;

	len = *ring->sq.kring_entries * sizeof(struct io_uring_sqe);
	ret = madvise(ring->sq.sqes, len, MADV_DONTFORK);
	if (ret == -1)
		return -errno;

	len = ring->sq.ring_sz;
	ret = madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
	if (ret == -1)
		return -errno;

	if (ring->cq.ring_ptr != ring->sq.ring_ptr) {
		len = ring->cq.ring_sz;
		ret = madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
		if (ret == -1)
			return -errno;
	}

	return 0;
}

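/*
 * Set up a ring with the given number of entries and the parameters in 'p',
 * then mmap it into 'ring'. Returns -errno on error, or zero on success.
 */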
int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
			       struct io_uring_params *p)
{
	int fd, ret;

	fd = __sys_io_uring_setup(entries, p);
	if (fd < 0)
		return -errno;

	ret = io_uring_queue_mmap(fd, p, ring);
	if (ret) {
		close(fd);
		return ret;
	}

	ring->features = p->features;
	return 0;
}

/*
 * Returns -errno on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags;

	return io_uring_queue_init_params(entries, ring, &p);
}
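
/*
 * Illustrative sketch (not part of this file): typical setup and teardown
 * with default flags.
 *
 *	struct io_uring ring;
 *	int ret = io_uring_queue_init(8, &ring, 0);
 *
 *	if (ret < 0)
 *		...handle -errno...
 *	...prepare SQEs, io_uring_submit(), reap CQEs...
 *	io_uring_queue_exit(&ring);
 */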

void io_uring_queue_exit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	struct io_uring_cq *cq = &ring->cq;

	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
	io_uring_unmap_rings(sq, cq);
	close(ring->ring_fd);
}

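/*
 * Query the opcodes supported by the kernel on an already set up ring.
 * Returns a malloc()ed probe that the caller should release with
 * io_uring_free_probe(), or NULL on failure.
 */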
struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
{
	struct io_uring_probe *probe;
	size_t len;
	int r;

	len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
	probe = malloc(len);
	if (!probe)
		return NULL;
	memset(probe, 0, len);

	r = io_uring_register_probe(ring, probe, 256);
	if (r >= 0)
		return probe;

	free(probe);
	return NULL;
}

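/*
 * Like io_uring_get_probe_ring(), but sets up (and tears down) a small
 * temporary ring to issue the probe. Returns NULL on failure.
 */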
struct io_uring_probe *io_uring_get_probe(void)
{
	struct io_uring ring;
	struct io_uring_probe *probe;
	int r;

	r = io_uring_queue_init(2, &ring, 0);
	if (r < 0)
		return NULL;

	probe = io_uring_get_probe_ring(&ring);
	io_uring_queue_exit(&ring);
	return probe;
}

void io_uring_free_probe(struct io_uring_probe *probe)
{
	free(probe);
}

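/*
 * Find last (most significant) set bit, 1-based: __fls(1) == 1,
 * __fls(0x80000000) == 32, and __fls(0) == 0.
 */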
static int __fls(int x)
{
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

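/*
 * Round 'depth' up to the next power of two, matching how the kernel
 * sizes the rings.
 */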
static unsigned roundup_pow2(unsigned depth)
{
	return 1UL << __fls(depth - 1);
}

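/*
 * Return the power-of-two order of pages needed to hold 'size' bytes,
 * i.e. the allocation covers (1 << npages(size, page_size)) pages.
 */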
static size_t npages(size_t size, unsigned page_size)
{
	size--;
	size /= page_size;
	return __fls(size);
}

#define KRING_SIZE	320

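/*
 * Estimate the total ring memory (ring headers, CQE array and SQE array)
 * the kernel allocates for the given entry counts, in bytes. KRING_SIZE
 * approximates the fixed ring header overhead.
 */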
static size_t rings_size(unsigned entries, unsigned cq_entries, unsigned page_size)
{
	size_t pages, sq_size, cq_size;

	cq_size = KRING_SIZE;
	cq_size += cq_entries * sizeof(struct io_uring_cqe);
	cq_size = (cq_size + 63) & ~63UL;
	pages = (size_t) 1 << npages(cq_size, page_size);

	sq_size = sizeof(struct io_uring_sqe) * entries;
	pages += (size_t) 1 << npages(sq_size, page_size);
	return pages * page_size;
}

#define KERN_MAX_ENTRIES	32768
#define KERN_MAX_CQ_ENTRIES	(2 * KERN_MAX_ENTRIES)

/*
 * Return the ulimit -l memlock memory required for a given ring setup, in
 * bytes. May return -errno on error. On newer (5.12+) kernels, io_uring no
 * longer requires any memlock memory, and hence this function will return
 * 0 for that case. On older (5.11 and prior) kernels, this will return the
 * required memory so that the caller can ensure that enough space is
 * available before setting up a ring with the specified parameters.
 */
ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p)
{
	struct io_uring_params lp = { };
	struct io_uring ring;
	unsigned cq_entries;
	long page_size;
	ssize_t ret;

	/*
	 * We only really use this inited ring to see if the kernel is newer
	 * or not. Newer kernels don't require memlocked memory. If we fail,
	 * it's most likely because it's an older kernel and we have no
	 * available memlock space. Just continue on, lp.features will still
	 * be zeroed at this point and we'll do the right thing.
	 */
	ret = io_uring_queue_init_params(entries, &ring, &lp);
	if (!ret)
		io_uring_queue_exit(&ring);

	/*
	 * Native workers imply using cgroup memory accounting, and hence no
	 * memlock memory is needed for the ring allocations.
	 */
	if (lp.features & IORING_FEAT_NATIVE_WORKERS)
		return 0;

	if (!entries)
		return -EINVAL;
	if (entries > KERN_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = KERN_MAX_ENTRIES;
	}

	entries = roundup_pow2(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		if (!p->cq_entries)
			return -EINVAL;
		cq_entries = p->cq_entries;
		if (cq_entries > KERN_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			cq_entries = KERN_MAX_CQ_ENTRIES;
		}
		cq_entries = roundup_pow2(cq_entries);
		if (cq_entries < entries)
			return -EINVAL;
	} else {
		cq_entries = 2 * entries;
	}

	page_size = sysconf(_SC_PAGESIZE);
	if (page_size < 0)
		page_size = 4096;

	return rings_size(entries, cq_entries, page_size);
}

/*
 * Return required ulimit -l memory space for a given ring setup. See
 * @io_uring_mlock_size_params().
 */
ssize_t io_uring_mlock_size(unsigned entries, unsigned flags)
{
	struct io_uring_params p = { .flags = flags, };

	return io_uring_mlock_size_params(entries, &p);
}
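
/*
 * Illustrative sketch (not part of this file): on pre-5.12 kernels, compare
 * the estimate against RLIMIT_MEMLOCK before setting up a large ring.
 * Assumes <sys/resource.h> for getrlimit().
 *
 *	struct rlimit rl;
 *	ssize_t need = io_uring_mlock_size(4096, 0);
 *
 *	if (need > 0 && !getrlimit(RLIMIT_MEMLOCK, &rl) &&
 *	    (rlim_t) need > rl.rlim_cur)
 *		...raise the limit or use a smaller ring...
 */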