/* SPDX-License-Identifier: MIT */
#define _DEFAULT_SOURCE

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>

#include "liburing/compat.h"
#include "liburing/io_uring.h"
#include "liburing.h"

#include "syscall.h"

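/*
 * Unmap the SQ and CQ ring memory. With IORING_FEAT_SINGLE_MMAP the CQ ring
 * shares the SQ ring mapping, so only unmap the CQ ring when it is a
 * separate mapping.
 */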
static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	munmap(sq->ring_ptr, sq->ring_sz);
	if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
		munmap(cq->ring_ptr, cq->ring_sz);
}

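/*
 * mmap() the SQ/CQ ring headers and the SQE array for 'fd', and point the
 * 'sq' and 'cq' fields at the offsets the kernel reported in 'p'. Returns 0
 * on success, -errno on failure, with any partial mappings torn down.
 */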
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	int ret;

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		if (cq->ring_sz > sq->ring_sz)
			sq->ring_sz = cq->ring_sz;
		cq->ring_sz = sq->ring_sz;
	}
	sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (sq->ring_ptr == MAP_FAILED)
		return -errno;

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		cq->ring_ptr = sq->ring_ptr;
	} else {
		cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
		if (cq->ring_ptr == MAP_FAILED) {
			cq->ring_ptr = NULL;
			ret = -errno;
			goto err;
		}
	}

	sq->khead = sq->ring_ptr + p->sq_off.head;
	sq->ktail = sq->ring_ptr + p->sq_off.tail;
	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
	sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
	sq->kflags = sq->ring_ptr + p->sq_off.flags;
	sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
	sq->array = sq->ring_ptr + p->sq_off.array;

	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd,
				IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		io_uring_unmap_rings(sq, cq);
		return ret;
	}

	cq->khead = cq->ring_ptr + p->cq_off.head;
	cq->ktail = cq->ring_ptr + p->cq_off.tail;
	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
	cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
	cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
	cq->cqes = cq->ring_ptr + p->cq_off.cqes;
	if (p->cq_off.flags)
		cq->kflags = cq->ring_ptr + p->cq_off.flags;
	return 0;
}

/*
 * For users that want to specify sq_thread_cpu or sq_thread_idle, this
 * interface is a convenient helper for mmap()ing the rings.
 * Returns -errno on error, or zero on success.  On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
{
	int ret;

	memset(ring, 0, sizeof(*ring));
	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
	if (!ret) {
		ring->flags = p->flags;
		ring->ring_fd = fd;
	}
	return ret;
}
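
/*
 * Example (sketch): a caller that invokes io_uring_setup(2) directly, e.g.
 * to fill in sq_thread_cpu or sq_thread_idle first, might map the resulting
 * fd roughly like this (QUEUE_DEPTH is a placeholder; needs <sys/syscall.h>):
 *
 *	struct io_uring_params p = { };
 *	struct io_uring ring;
 *	int fd, ret;
 *
 *	p.flags = IORING_SETUP_SQPOLL;
 *	p.sq_thread_idle = 2000;
 *	fd = syscall(__NR_io_uring_setup, QUEUE_DEPTH, &p);
 *	if (fd < 0)
 *		return -errno;
 *	ret = io_uring_queue_mmap(fd, &p, &ring);
 *	if (ret < 0)
 *		return ret;
 */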

/*
 * Ensure that the mmap'ed rings aren't available to a child after a fork(2).
 * This uses madvise(..., MADV_DONTFORK) on the mmap'ed ranges.
 */
int io_uring_ring_dontfork(struct io_uring *ring)
{
	size_t len;
	int ret;

	if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
		return -EINVAL;

	len = *ring->sq.kring_entries * sizeof(struct io_uring_sqe);
	ret = madvise(ring->sq.sqes, len, MADV_DONTFORK);
	if (ret == -1)
		return -errno;

	len = ring->sq.ring_sz;
	ret = madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
	if (ret == -1)
		return -errno;

	if (ring->cq.ring_ptr != ring->sq.ring_ptr) {
		len = ring->cq.ring_sz;
		ret = madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
		if (ret == -1)
			return -errno;
	}

	return 0;
}
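
/*
 * Example (sketch): a process that fork()s after ring setup, and does not
 * want the child to inherit the ring mappings, might do:
 *
 *	struct io_uring ring;
 *
 *	if (io_uring_queue_init(8, &ring, 0) == 0)
 *		io_uring_ring_dontfork(&ring);
 *
 * After this, the ring memory is simply absent from any child created with
 * fork(2); only the parent may keep submitting and reaping.
 */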

int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
			       struct io_uring_params *p)
{
	int fd, ret;

	fd = __sys_io_uring_setup(entries, p);
	if (fd < 0)
		return -errno;

	ret = io_uring_queue_mmap(fd, p, ring);
	if (ret) {
		close(fd);
		return ret;
	}

	ring->features = p->features;
	return 0;
}

/*
 * Returns -errno on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags;

	return io_uring_queue_init_params(entries, ring, &p);
}
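
/*
 * Example (sketch): minimal setup, one NOP submission, and teardown using
 * the public liburing helpers:
 *
 *	struct io_uring ring;
 *	struct io_uring_sqe *sqe;
 *	struct io_uring_cqe *cqe;
 *
 *	if (io_uring_queue_init(8, &ring, 0) < 0)
 *		return 1;
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_nop(sqe);
 *	io_uring_submit(&ring);
 *	if (io_uring_wait_cqe(&ring, &cqe) == 0)
 *		io_uring_cqe_seen(&ring, cqe);
 *	io_uring_queue_exit(&ring);
 */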

void io_uring_queue_exit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	struct io_uring_cq *cq = &ring->cq;

	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
	io_uring_unmap_rings(sq, cq);
	close(ring->ring_fd);
}

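/*
 * Allocate an io_uring_probe large enough for 256 opcodes and fill it via
 * io_uring_register_probe() on the given ring. Returns NULL on failure; the
 * result is freed with io_uring_free_probe().
 */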
struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
{
	struct io_uring_probe *probe;
	size_t len;
	int r;

	len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
	probe = malloc(len);
	if (!probe)
		return NULL;
	memset(probe, 0, len);

	r = io_uring_register_probe(ring, probe, 256);
	if (r >= 0)
		return probe;

	free(probe);
	return NULL;
}

struct io_uring_probe *io_uring_get_probe(void)
{
	struct io_uring ring;
	struct io_uring_probe *probe;
	int r;

	r = io_uring_queue_init(2, &ring, 0);
	if (r < 0)
		return NULL;

	probe = io_uring_get_probe_ring(&ring);
	io_uring_queue_exit(&ring);
	return probe;
}

void io_uring_free_probe(struct io_uring_probe *probe)
{
	free(probe);
}

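/*
 * Find last (most significant) set bit, 1-based: __fls(1) == 1,
 * __fls(0x80000000) == 32, and __fls(0) == 0.
 */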
static int __fls(int x)
{
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

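/*
 * Round 'depth' up to the nearest power of two; a power of two is returned
 * unchanged.
 */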
static unsigned roundup_pow2(unsigned depth)
{
	return 1UL << __fls(depth - 1);
}

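/*
 * Return the power-of-two order (as used for page allocations) of the
 * number of pages needed to hold 'size' bytes.
 */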
static size_t npages(size_t size, unsigned page_size)
{
	size--;
	size /= page_size;
	return __fls(size);
}

#define KRING_SIZE	320

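/*
 * Estimate, in bytes, how much memory the kernel allocates for the rings at
 * the given SQ/CQ sizes, rounding each ring up to a power-of-two number of
 * pages. KRING_SIZE approximates the kernel's ring header.
 */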
static size_t rings_size(unsigned entries, unsigned cq_entries, unsigned page_size)
{
	size_t pages, sq_size, cq_size;

	cq_size = KRING_SIZE;
	cq_size += cq_entries * sizeof(struct io_uring_cqe);
	cq_size = (cq_size + 63) & ~63UL;
	pages = (size_t) 1 << npages(cq_size, page_size);

	sq_size = sizeof(struct io_uring_sqe) * entries;
	pages += (size_t) 1 << npages(sq_size, page_size);
	return pages * page_size;
}

#define KERN_MAX_ENTRIES	32768
#define KERN_MAX_CQ_ENTRIES	(2 * KERN_MAX_ENTRIES)

/*
 * Return the ulimit -l memlock memory required for a given ring setup, in
 * bytes. May return -errno on error. On newer (5.12+) kernels, io_uring no
 * longer requires any memlock memory, and hence this function will return 0
 * for that case. On older (5.11 and prior) kernels, this will return the
 * required memory so that the caller can ensure that enough space is
 * available before setting up a ring with the specified parameters.
 */
ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p)
{
	struct io_uring_params lp = { };
	struct io_uring ring;
	unsigned cq_entries;
	long page_size;
	ssize_t ret;

	/*
	 * We only really use this initialized ring to see if the kernel is
	 * newer or not. Newer kernels don't require memlocked memory. If we
	 * fail, it's most likely because it's an older kernel and we have no
	 * available memlock space. Just continue on, lp.features will still
	 * be zeroed at this point and we'll do the right thing.
	 */
	ret = io_uring_queue_init_params(entries, &ring, &lp);
	if (!ret)
		io_uring_queue_exit(&ring);

	/*
	 * Native workers imply using cgroup memory accounting, and hence no
	 * memlock memory is needed for the ring allocations.
	 */
	if (lp.features & IORING_FEAT_NATIVE_WORKERS)
		return 0;

	if (!entries)
		return -EINVAL;
	if (entries > KERN_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = KERN_MAX_ENTRIES;
	}

	entries = roundup_pow2(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		if (!p->cq_entries)
			return -EINVAL;
		cq_entries = p->cq_entries;
		if (cq_entries > KERN_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			cq_entries = KERN_MAX_CQ_ENTRIES;
		}
		cq_entries = roundup_pow2(cq_entries);
		if (cq_entries < entries)
			return -EINVAL;
	} else {
		cq_entries = 2 * entries;
	}

	page_size = sysconf(_SC_PAGESIZE);
	if (page_size < 0)
		page_size = 4096;

	return rings_size(entries, cq_entries, page_size);
}

/*
 * Return the required ulimit -l memory space for a given ring setup. See
 * @io_uring_mlock_size_params().
 */
ssize_t io_uring_mlock_size(unsigned entries, unsigned flags)
{
	struct io_uring_params p = { .flags = flags, };

	return io_uring_mlock_size_params(entries, &p);
}
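
/*
 * Example (sketch): on pre-5.12 kernels a caller might compare the reported
 * requirement against RLIMIT_MEMLOCK before creating the ring (QUEUE_DEPTH
 * is a placeholder; needs <sys/resource.h>):
 *
 *	struct rlimit rl;
 *	ssize_t need;
 *
 *	need = io_uring_mlock_size(QUEUE_DEPTH, 0);
 *	if (need > 0) {
 *		getrlimit(RLIMIT_MEMLOCK, &rl);
 *		if ((unsigned long) need > rl.rlim_cur)
 *			fprintf(stderr, "need %zd bytes of ulimit -l\n", need);
 *	}
 *
 * A negative return is an -errno style error; zero means no memlock memory
 * is needed (IORING_FEAT_NATIVE_WORKERS kernels).
 */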