/* Copyright libuv contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"
#include <errno.h>
#include <sys/epoll.h>

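/* Create the epoll instance that backs the event loop, falling back to
 * epoll_create(2) plus an explicit FD_CLOEXEC on kernels that predate
 * epoll_create1(2) or its O_CLOEXEC flag.
 */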
int uv__epoll_init(uv_loop_t* loop) {
  int fd;
  fd = epoll_create1(O_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the O_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = epoll_create(256);

    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;
  if (fd == -1)
    return UV__ERR(errno);

  return 0;
}


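/* Drop a closing file descriptor from the epoll set and from any events
 * already reported by the current poll, so stale entries are never
 * dispatched to watchers.
 */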
void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct epoll_event* events;
  struct epoll_event dummy;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

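  /* uv__io_poll() stashes its events array and event count in the two spare
   * slots past the end of loop->watchers while callbacks run; pick them up
   * here so in-flight events for this fd can be marked invalid.
   */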
  events = (struct epoll_event*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events != NULL)
    /* Invalidate events with the same file descriptor */
    for (i = 0; i < nfds; i++)
      if (events[i].data.fd == fd)
        events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll set.
   * This avoids a problem where the same open file description (not just the
   * descriptor) remains open in another process, causing repeated junk epoll
   * events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   */
  if (loop->backend_fd >= 0) {
    /* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
     * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
     */
    memset(&dummy, 0, sizeof(dummy));
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  }
}


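/* Probe whether `fd` can be watched with epoll by tentatively adding it to
 * the epoll set and removing it again. Regular files, for example, are
 * rejected by epoll_ctl() with EPERM.
 */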
int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


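/* Submit pending watcher changes to the kernel, wait for I/O for at most
 * `timeout` milliseconds (-1 means wait indefinitely), then dispatch the
 * reported events to their watcher callbacks.
 */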
void uv__io_poll(uv_loop_t* loop, int timeout) {
  /* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
   * effectively infinite on 32-bit architectures. To avoid blocking
   * indefinitely, we cap the timeout and poll again if necessary.
   *
   * Note that "30 minutes" is a simplification because it depends on
   * the value of CONFIG_HZ. The magic constant assumes CONFIG_HZ=1200,
   * that being the largest value I have seen in the wild (and only once).
   */
  static const int max_safe_timeout = 1789569;
  static int no_epoll_pwait_cached;
  static int no_epoll_wait_cached;
  int no_epoll_pwait;
  int no_epoll_wait;
  struct epoll_event events[1024];
  struct epoll_event* pe;
  struct epoll_event e;
  int real_timeout;
  QUEUE* q;
  uv__io_t* w;
  sigset_t sigset;
  uint64_t sigmask;
  uint64_t base;
  int have_signals;
  int nevents;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(QUEUE_EMPTY(&loop->watcher_queue));
    return;
  }

  memset(&e, 0, sizeof(e));

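  /* Flush pending interest changes: register every watcher whose event mask
   * changed since the last poll with epoll_ctl() before going to sleep.
   */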
  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
    q = QUEUE_HEAD(&loop->watcher_queue);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);

    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    e.events = w->pevents;
    e.data.fd = w->fd;

    if (w->events == 0)
      op = EPOLL_CTL_ADD;
    else
      op = EPOLL_CTL_MOD;

    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
     * events, skip the syscall and squelch the events after epoll_wait().
     */
    if (epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
      if (errno != EEXIST)
        abort();

      assert(op == EPOLL_CTL_ADD);

      /* We've reactivated a file descriptor that's been watched before. */
      if (epoll_ctl(loop->backend_fd, EPOLL_CTL_MOD, w->fd, &e))
        abort();
    }

    w->events = w->pevents;
  }

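  /* When UV_LOOP_BLOCK_SIGPROF is set, SIGPROF is masked for the duration of
   * the wait (via epoll_pwait() or an explicit pthread_sigmask() pair) so a
   * sampling profiler doesn't keep waking up the poll. `sigmask` only serves
   * as a flag here; the kernel gets the full `sigset`.
   */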
  sigmask = 0;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask |= 1 << (SIGPROF - 1);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

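  /* With idle-time metrics enabled, make the first pass through the poll
   * loop non-blocking so already-pending events aren't billed as idle time;
   * the caller's timeout is restored once that first wait returns.
   */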
  if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  /* You could argue there is a dependency between these two but
   * ultimately we don't care about their ordering with respect
   * to one another. Worst case, we make a few system calls that
   * could have been avoided because another thread already knows
   * they fail with ENOSYS. Hardly the end of the world.
   */
  no_epoll_pwait = uv__load_relaxed(&no_epoll_pwait_cached);
  no_epoll_wait = uv__load_relaxed(&no_epoll_wait_cached);

  for (;;) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* See the comment for max_safe_timeout for an explanation of why
     * this is necessary. Executive summary: kernel bug workaround.
     */
    if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
      timeout = max_safe_timeout;

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        abort();

    if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
      nfds = epoll_pwait(loop->backend_fd,
                         events,
                         ARRAY_SIZE(events),
                         timeout,
                         &sigset);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_pwait_cached, 1);
        no_epoll_pwait = 1;
      }
    } else {
      nfds = epoll_wait(loop->backend_fd,
                        events,
                        ARRAY_SIZE(events),
                        timeout);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_wait_cached, 1);
        no_epoll_wait = 1;
      }
    }

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
        abort();

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == 0) {
      assert(timeout != -1);

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* We may have been inside the system call for longer than |timeout|
       * milliseconds so we need to update the timestamp to avoid drift.
       */
      goto update_timeout;
    }

    if (nfds == -1) {
      if (errno == ENOSYS) {
        /* epoll_wait() or epoll_pwait() failed, try the other system call. */
        assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
        continue;
      }

      if (errno != EINTR)
        abort();

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;

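    /* Expose the events array (and its length) to
     * uv__platform_invalidate_fd() through the two spare slots at the end of
     * loop->watchers, so events for descriptors closed by the callbacks below
     * can be invalidated before they are dispatched.
     */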
    {
      /* Squelch a -Waddress-of-packed-member warning with gcc >= 9. */
      union {
        struct epoll_event* events;
        uv__io_t* watchers;
      } x;

      x.events = events;
      assert(loop->watchers != NULL);
      loop->watchers[loop->nwatchers] = x.watchers;
      loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    }

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         */
        epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has
       * stopped the current watcher. Also filters out events that the user
       * has not requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: this happens when epoll reports EPOLLIN|EPOLLHUP, the
       * user reads the available data, calls uv_read_stop(), then sometime
       * later calls uv_read_start() again. By then, libuv has forgotten
       * about the hangup and the kernel won't report EPOLLIN again because
       * there's nothing left to read. If anything, libuv is to blame here.
       * The current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return; /* Event loop should cycle now so don't poll again. */

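    /* If callbacks ran and the events array came back full, there may be
     * more events waiting in the kernel; poll again without blocking, up to
     * `count` times, before handing control back to the event loop.
     */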
    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

update_timeout:
    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      return;

    timeout = real_timeout;
  }
}