/* Copyright libuv contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"
#include <errno.h>
#include <sys/epoll.h>

int uv__epoll_init(uv_loop_t* loop) {
  int fd;
  fd = epoll_create1(O_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the O_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = epoll_create(256);

    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;
  if (fd == -1)
    return UV__ERR(errno);

  return 0;
}


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct epoll_event* events;
  struct epoll_event dummy;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct epoll_event*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events != NULL)
    /* Invalidate events with same file descriptor */
    for (i = 0; i < nfds; i++)
      if (events[i].data.fd == fd)
        events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   */
  if (loop->backend_fd >= 0) {
    /* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
     * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
     */
    memset(&dummy, 0, sizeof(dummy));
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  }
}


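/* Check whether |fd| can be watched with epoll: try to add it to the epoll
 * set and remove it again.  EEXIST means it was already registered, which
 * also proves it is pollable.  Returns 0 if the descriptor can be polled,
 * or UV__ERR(errno) otherwise.
 */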
int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


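/* Poll for I/O and run the callbacks of watchers that became ready.  The
 * timeout is in milliseconds: -1 means block until an event arrives, 0 means
 * poll without blocking.
 */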
void uv__io_poll(uv_loop_t* loop, int timeout) {
  /* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
   * effectively infinite on 32 bits architectures.  To avoid blocking
   * indefinitely, we cap the timeout and poll again if necessary.
   *
   * Note that "30 minutes" is a simplification because it depends on
   * the value of CONFIG_HZ.  The magic constant assumes CONFIG_HZ=1200,
   * that being the largest value I have seen in the wild (and only once.)
   */
  static const int max_safe_timeout = 1789569;
  static int no_epoll_pwait_cached;
  static int no_epoll_wait_cached;
  int no_epoll_pwait;
  int no_epoll_wait;
  struct epoll_event events[1024];
  struct epoll_event* pe;
  struct epoll_event e;
  int real_timeout;
  QUEUE* q;
  uv__io_t* w;
  sigset_t sigset;
  uint64_t sigmask;
  uint64_t base;
  int have_signals;
  int nevents;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(QUEUE_EMPTY(&loop->watcher_queue));
    return;
  }

  memset(&e, 0, sizeof(e));

  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
    q = QUEUE_HEAD(&loop->watcher_queue);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);

    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    e.events = w->pevents;
    e.data.fd = w->fd;

    if (w->events == 0)
      op = EPOLL_CTL_ADD;
    else
      op = EPOLL_CTL_MOD;

    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
     * events, skip the syscall and squelch the events after epoll_wait().
     */
    if (epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
      if (errno != EEXIST)
        abort();

      assert(op == EPOLL_CTL_ADD);

      /* We've reactivated a file descriptor that's been watched before. */
      if (epoll_ctl(loop->backend_fd, EPOLL_CTL_MOD, w->fd, &e))
        abort();
    }

    w->events = w->pevents;
  }

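  /* If the loop was configured with UV_LOOP_BLOCK_SIGPROF, block SIGPROF for
   * the duration of the wait below, either via epoll_pwait() or with a
   * pthread_sigmask() pair on kernels that lack it, so the profiling signal
   * cannot cause spurious wakeups.
   */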
  sigmask = 0;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask |= 1 << (SIGPROF - 1);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  /* You could argue there is a dependency between these two but
   * ultimately we don't care about their ordering with respect
   * to one another. Worst case, we make a few system calls that
   * could have been avoided because another thread already knows
   * they fail with ENOSYS. Hardly the end of the world.
   */
  no_epoll_pwait = uv__load_relaxed(&no_epoll_pwait_cached);
  no_epoll_wait = uv__load_relaxed(&no_epoll_wait_cached);

  for (;;) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* See the comment for max_safe_timeout for an explanation of why
     * this is necessary.  Executive summary: kernel bug workaround.
     */
    if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
      timeout = max_safe_timeout;

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        abort();

    if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
      nfds = epoll_pwait(loop->backend_fd,
                         events,
                         ARRAY_SIZE(events),
                         timeout,
                         &sigset);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_pwait_cached, 1);
        no_epoll_pwait = 1;
      }
    } else {
      nfds = epoll_wait(loop->backend_fd,
                        events,
                        ARRAY_SIZE(events),
                        timeout);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_wait_cached, 1);
        no_epoll_wait = 1;
      }
    }

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
        abort();

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == 0) {
      assert(timeout != -1);

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* We may have been inside the system call for longer than |timeout|
       * milliseconds so we need to update the timestamp to avoid drift.
       */
      goto update_timeout;
    }

    if (nfds == -1) {
      if (errno == ENOSYS) {
        /* epoll_wait() or epoll_pwait() failed, try the other system call. */
        assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
        continue;
      }

      if (errno != EINTR)
        abort();

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;

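    /* Stash the events array and its length in the unused tail slots of
     * loop->watchers so that uv__platform_invalidate_fd() can find and
     * invalidate pending events for file descriptors closed by the
     * callbacks invoked below.
     */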
    {
      /* Squelch a -Waddress-of-packed-member warning with gcc >= 9. */
      union {
        struct epoll_event* events;
        uv__io_t* watchers;
      } x;

      x.events = events;
      assert(loop->watchers != NULL);
      loop->watchers[loop->nwatchers] = x.watchers;
      loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    }

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         */
        epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has
       * stopped the current watcher. Also filters out events that the user
       * has not requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again.  By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read.  If anything, libuv is to blame here.  The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event.  We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last.  This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

update_timeout:
    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      return;

    timeout = real_timeout;
  }
}