• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* SPDX-License-Identifier: MIT */
2 #define _DEFAULT_SOURCE
3 
4 #include "lib.h"
5 #include "syscall.h"
6 #include "liburing.h"
7 #include "int_flags.h"
8 #include "liburing/compat.h"
9 #include "liburing/io_uring.h"
10 
io_uring_unmap_rings(struct io_uring_sq * sq,struct io_uring_cq * cq)11 static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
12 {
13 	__sys_munmap(sq->ring_ptr, sq->ring_sz);
14 	if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
15 		__sys_munmap(cq->ring_ptr, cq->ring_sz);
16 }
17 
io_uring_mmap(int fd,struct io_uring_params * p,struct io_uring_sq * sq,struct io_uring_cq * cq)18 static int io_uring_mmap(int fd, struct io_uring_params *p,
19 			 struct io_uring_sq *sq, struct io_uring_cq *cq)
20 {
21 	size_t size;
22 	int ret;
23 
24 	size = sizeof(struct io_uring_cqe);
25 	if (p->flags & IORING_SETUP_CQE32)
26 		size += sizeof(struct io_uring_cqe);
27 
28 	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
29 	cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;
30 
31 	if (p->features & IORING_FEAT_SINGLE_MMAP) {
32 		if (cq->ring_sz > sq->ring_sz)
33 			sq->ring_sz = cq->ring_sz;
34 		cq->ring_sz = sq->ring_sz;
35 	}
36 	sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
37 				  MAP_SHARED | MAP_POPULATE, fd,
38 				  IORING_OFF_SQ_RING);
39 	if (IS_ERR(sq->ring_ptr))
40 		return PTR_ERR(sq->ring_ptr);
41 
42 	if (p->features & IORING_FEAT_SINGLE_MMAP) {
43 		cq->ring_ptr = sq->ring_ptr;
44 	} else {
45 		cq->ring_ptr = __sys_mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
46 					  MAP_SHARED | MAP_POPULATE, fd,
47 					  IORING_OFF_CQ_RING);
48 		if (IS_ERR(cq->ring_ptr)) {
49 			ret = PTR_ERR(cq->ring_ptr);
50 			cq->ring_ptr = NULL;
51 			goto err;
52 		}
53 	}
54 
55 	sq->khead = sq->ring_ptr + p->sq_off.head;
56 	sq->ktail = sq->ring_ptr + p->sq_off.tail;
57 	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
58 	sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
59 	sq->kflags = sq->ring_ptr + p->sq_off.flags;
60 	sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
61 	sq->array = sq->ring_ptr + p->sq_off.array;
62 
63 	size = sizeof(struct io_uring_sqe);
64 	if (p->flags & IORING_SETUP_SQE128)
65 		size += 64;
66 	sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,
67 			      MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
68 	if (IS_ERR(sq->sqes)) {
69 		ret = PTR_ERR(sq->sqes);
70 err:
71 		io_uring_unmap_rings(sq, cq);
72 		return ret;
73 	}
74 
75 	cq->khead = cq->ring_ptr + p->cq_off.head;
76 	cq->ktail = cq->ring_ptr + p->cq_off.tail;
77 	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
78 	cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
79 	cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
80 	cq->cqes = cq->ring_ptr + p->cq_off.cqes;
81 	if (p->cq_off.flags)
82 		cq->kflags = cq->ring_ptr + p->cq_off.flags;
83 	return 0;
84 }
85 
86 /*
87  * For users that want to specify sq_thread_cpu or sq_thread_idle, this
88  * interface is a convenient helper for mmap()ing the rings.
89  * Returns -errno on error, or zero on success.  On success, 'ring'
90  * contains the necessary information to read/write to the rings.
91  */
io_uring_queue_mmap(int fd,struct io_uring_params * p,struct io_uring * ring)92 int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
93 {
94 	int ret;
95 
96 	memset(ring, 0, sizeof(*ring));
97 	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
98 	if (!ret) {
99 		ring->flags = p->flags;
100 		ring->ring_fd = ring->enter_ring_fd = fd;
101 		ring->int_flags = 0;
102 	}
103 	return ret;
104 }
105 
106 /*
107  * Ensure that the mmap'ed rings aren't available to a child after a fork(2).
108  * This uses madvise(..., MADV_DONTFORK) on the mmap'ed ranges.
109  */
io_uring_ring_dontfork(struct io_uring * ring)110 int io_uring_ring_dontfork(struct io_uring *ring)
111 {
112 	size_t len;
113 	int ret;
114 
115 	if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
116 		return -EINVAL;
117 
118 	len = sizeof(struct io_uring_sqe);
119 	if (ring->flags & IORING_SETUP_SQE128)
120 		len += 64;
121 	len *= *ring->sq.kring_entries;
122 	ret = __sys_madvise(ring->sq.sqes, len, MADV_DONTFORK);
123 	if (ret < 0)
124 		return ret;
125 
126 	len = ring->sq.ring_sz;
127 	ret = __sys_madvise(ring->sq.ring_ptr, len, MADV_DONTFORK);
128 	if (ret < 0)
129 		return ret;
130 
131 	if (ring->cq.ring_ptr != ring->sq.ring_ptr) {
132 		len = ring->cq.ring_sz;
133 		ret = __sys_madvise(ring->cq.ring_ptr, len, MADV_DONTFORK);
134 		if (ret < 0)
135 			return ret;
136 	}
137 
138 	return 0;
139 }
140 
io_uring_queue_init_params(unsigned entries,struct io_uring * ring,struct io_uring_params * p)141 int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
142 			       struct io_uring_params *p)
143 {
144 	int fd, ret;
145 
146 	fd = ____sys_io_uring_setup(entries, p);
147 	if (fd < 0)
148 		return fd;
149 
150 	ret = io_uring_queue_mmap(fd, p, ring);
151 	if (ret) {
152 		__sys_close(fd);
153 		return ret;
154 	}
155 
156 	ring->features = p->features;
157 	return 0;
158 }
159 
160 /*
161  * Returns -errno on error, or zero on success. On success, 'ring'
162  * contains the necessary information to read/write to the rings.
163  */
io_uring_queue_init(unsigned entries,struct io_uring * ring,unsigned flags)164 int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
165 {
166 	struct io_uring_params p;
167 
168 	memset(&p, 0, sizeof(p));
169 	p.flags = flags;
170 
171 	return io_uring_queue_init_params(entries, ring, &p);
172 }
173 
io_uring_queue_exit(struct io_uring * ring)174 void io_uring_queue_exit(struct io_uring *ring)
175 {
176 	struct io_uring_sq *sq = &ring->sq;
177 	struct io_uring_cq *cq = &ring->cq;
178 	size_t sqe_size;
179 
180 	sqe_size = sizeof(struct io_uring_sqe);
181 	if (ring->flags & IORING_SETUP_SQE128)
182 		sqe_size += 64;
183 	__sys_munmap(sq->sqes, sqe_size * *sq->kring_entries);
184 	io_uring_unmap_rings(sq, cq);
185 	/*
186 	 * Not strictly required, but frees up the slot we used now rather
187 	 * than at process exit time.
188 	 */
189 	if (ring->int_flags & INT_FLAG_REG_RING)
190 		io_uring_unregister_ring_fd(ring);
191 	__sys_close(ring->ring_fd);
192 }
193 
io_uring_get_probe_ring(struct io_uring * ring)194 struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring)
195 {
196 	struct io_uring_probe *probe;
197 	size_t len;
198 	int r;
199 
200 	len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
201 	probe = uring_malloc(len);
202 	if (!probe)
203 		return NULL;
204 	memset(probe, 0, len);
205 
206 	r = io_uring_register_probe(ring, probe, 256);
207 	if (r >= 0)
208 		return probe;
209 
210 	uring_free(probe);
211 	return NULL;
212 }
213 
io_uring_get_probe(void)214 struct io_uring_probe *io_uring_get_probe(void)
215 {
216 	struct io_uring ring;
217 	struct io_uring_probe *probe;
218 	int r;
219 
220 	r = io_uring_queue_init(2, &ring, 0);
221 	if (r < 0)
222 		return NULL;
223 
224 	probe = io_uring_get_probe_ring(&ring);
225 	io_uring_queue_exit(&ring);
226 	return probe;
227 }
228 
/*
 * Free a probe by handing it to uring_free(), the counterpart of the
 * uring_malloc() used by io_uring_get_probe_ring().
 */
void io_uring_free_probe(struct io_uring_probe *probe)
{
	uring_free(probe);
}
233 
/*
 * Find the last (most significant) set bit in 'x'.
 * Returns its 1-based position, or 0 when no bit is set.
 */
static inline int __fls(int x)
{
	const int width = 8 * sizeof(x);

	return x ? width - __builtin_clz(x) : 0;
}
240 
/*
 * Round 'depth' up to a power of two: shift 1 by the position of the
 * highest bit set in 'depth - 1'. A depth of 1 yields 1.
 */
static unsigned roundup_pow2(unsigned depth)
{
	int shift = __fls(depth - 1);

	return 1UL << shift;
}
245 
/*
 * Despite the name, this returns a power-of-two exponent rather than a page
 * count: the position of the highest bit set in (size - 1) / page_size.
 * Callers shift 1 by the result to get a number of pages.
 */
static size_t npages(size_t size, unsigned page_size)
{
	size_t last_page = (size - 1) / page_size;

	return __fls(last_page);
}
252 
253 #define KRING_SIZE	320
254 
rings_size(struct io_uring_params * p,unsigned entries,unsigned cq_entries,unsigned page_size)255 static size_t rings_size(struct io_uring_params *p, unsigned entries,
256 			 unsigned cq_entries, unsigned page_size)
257 {
258 	size_t pages, sq_size, cq_size;
259 
260 	cq_size = sizeof(struct io_uring_cqe);
261 	if (p->flags & IORING_SETUP_CQE32)
262 		cq_size += sizeof(struct io_uring_cqe);
263 	cq_size *= cq_entries;
264 	cq_size += KRING_SIZE;
265 	cq_size = (cq_size + 63) & ~63UL;
266 	pages = (size_t) 1 << npages(cq_size, page_size);
267 
268 	sq_size = sizeof(struct io_uring_sqe);
269 	if (p->flags & IORING_SETUP_SQE128)
270 		sq_size += 64;
271 	sq_size *= entries;
272 	pages += (size_t) 1 << npages(sq_size, page_size);
273 	return pages * page_size;
274 }
275 
276 #define KERN_MAX_ENTRIES	32768
277 #define KERN_MAX_CQ_ENTRIES	(2 * KERN_MAX_ENTRIES)
278 
279 /*
280  * Return the required ulimit -l memlock memory required for a given ring
281  * setup, in bytes. May return -errno on error. On newer (5.12+) kernels,
282  * io_uring no longer requires any memlock memory, and hence this function
283  * will return 0 for that case. On older (5.11 and prior) kernels, this will
284  * return the required memory so that the caller can ensure that enough space
285  * is available before setting up a ring with the specified parameters.
286  */
io_uring_mlock_size_params(unsigned entries,struct io_uring_params * p)287 ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p)
288 {
289 	struct io_uring_params lp = { };
290 	struct io_uring ring;
291 	unsigned cq_entries;
292 	long page_size;
293 	ssize_t ret;
294 
295 	/*
296 	 * We only really use this inited ring to see if the kernel is newer
297 	 * or not. Newer kernels don't require memlocked memory. If we fail,
298 	 * it's most likely because it's an older kernel and we have no
299 	 * available memlock space. Just continue on, lp.features will still
300 	 * be zeroed at this point and we'll do the right thing.
301 	 */
302 	ret = io_uring_queue_init_params(entries, &ring, &lp);
303 	if (!ret)
304 		io_uring_queue_exit(&ring);
305 
306 	/*
307 	 * Native workers imply using cgroup memory accounting, and hence no
308 	 * memlock memory is needed for the ring allocations.
309 	 */
310 	if (lp.features & IORING_FEAT_NATIVE_WORKERS)
311 		return 0;
312 
313 	if (!entries)
314 		return -EINVAL;
315 	if (entries > KERN_MAX_ENTRIES) {
316 		if (!(p->flags & IORING_SETUP_CLAMP))
317 			return -EINVAL;
318 		entries = KERN_MAX_ENTRIES;
319 	}
320 
321 	entries = roundup_pow2(entries);
322 	if (p->flags & IORING_SETUP_CQSIZE) {
323 		if (!p->cq_entries)
324 			return -EINVAL;
325 		cq_entries = p->cq_entries;
326 		if (cq_entries > KERN_MAX_CQ_ENTRIES) {
327 			if (!(p->flags & IORING_SETUP_CLAMP))
328 				return -EINVAL;
329 			cq_entries = KERN_MAX_CQ_ENTRIES;
330 		}
331 		cq_entries = roundup_pow2(cq_entries);
332 		if (cq_entries < entries)
333 			return -EINVAL;
334 	} else {
335 		cq_entries = 2 * entries;
336 	}
337 
338 	page_size = get_page_size();
339 	return rings_size(p, entries, cq_entries, page_size);
340 }
341 
342 /*
343  * Return required ulimit -l memory space for a given ring setup. See
344  * @io_uring_mlock_size_params().
345  */
io_uring_mlock_size(unsigned entries,unsigned flags)346 ssize_t io_uring_mlock_size(unsigned entries, unsigned flags)
347 {
348 	struct io_uring_params p = { .flags = flags, };
349 
350 	return io_uring_mlock_size_params(entries, &p);
351 }
352