1 /*
2 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3 * Copyright 2007-2012 Niels Provos, Nick Mathewson
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 #include "event2/event-config.h"
28
29 #include <stdint.h>
30 #include <sys/types.h>
31 #include <sys/resource.h>
32 #ifdef _EVENT_HAVE_SYS_TIME_H
33 #include <sys/time.h>
34 #endif
35 #include <sys/queue.h>
36 #include <sys/epoll.h>
37 #include <signal.h>
38 #include <limits.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <unistd.h>
43 #include <errno.h>
44 #ifdef _EVENT_HAVE_FCNTL_H
45 #include <fcntl.h>
46 #endif
47
48 #include "event-internal.h"
49 #include "evsignal-internal.h"
50 #include "event2/thread.h"
51 #include "evthread-internal.h"
52 #include "log-internal.h"
53 #include "evmap-internal.h"
54 #include "changelist-internal.h"
55
/* Per-event_base state for the epoll backend; created by epoll_init() and
 * released by epoll_dealloc(). */
struct epollop {
	/* Result buffer handed to epoll_wait().  Allocated in epoll_init()
	 * and doubled by epoll_dispatch() whenever a wait fills it. */
	struct epoll_event *events;
	/* Number of entries that 'events' can currently hold. */
	int nevents;
	/* File descriptor of the epoll instance, from epoll_create(). */
	int epfd;
};
61
/* Backend entry points shared by both eventop tables; defined later in
 * this file. */
static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);

/* Backend table used when the changelist is enabled.  Add/del requests go
 * through event_changelist_add()/event_changelist_del(); the accumulated
 * base->changelist entries are then applied with epoll_ctl() at the start
 * of epoll_dispatch().  epoll_init() installs this table into base->evsel
 * when EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST is set, or when the
 * EVENT_EPOLL_USE_CHANGELIST environment variable exists and
 * EVENT_BASE_FLAG_IGNORE_ENV is not set.
 *
 * NOTE(review): the initializer is positional; slot meanings are defined
 * by struct eventop in event-internal.h -- confirm there before editing. */
static const struct eventop epollops_changelist = {
	"epoll (with changelist)",
	epoll_init,
	event_changelist_add,
	event_changelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	EVENT_CHANGELIST_FDINFO_SIZE
};
77
78
/* Immediate (non-changelist) add/del entry points; defined later in this
 * file.  Each one builds a single event_change and applies it right away
 * through epoll_apply_one_change(). */
static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);

/* Default epoll backend table: every add/del is pushed to the kernel
 * immediately instead of being queued on the changelist.  Not static, so
 * it is visible to other translation units.
 *
 * NOTE(review): the initializer is positional; slot meanings are defined
 * by struct eventop in event-internal.h.  The trailing 0 sits in the slot
 * where the changelist table passes EVENT_CHANGELIST_FDINFO_SIZE --
 * presumably "no per-fd backend data"; confirm against the header. */
const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_nochangelist_add,
	epoll_nochangelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	0
};
95
/* Number of epoll_event slots allocated for the result buffer by
 * epoll_init(). */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the result buffer once it holds at
 * least this many slots. */
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds, to an even 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
106
107 static void *
epoll_init(struct event_base * base)108 epoll_init(struct event_base *base)
109 {
110 int epfd;
111 struct epollop *epollop;
112
113 /* Initialize the kernel queue. (The size field is ignored since
114 * 2.6.8.) */
115 if ((epfd = epoll_create(32000)) == -1) {
116 if (errno != ENOSYS)
117 event_warn("epoll_create");
118 return (NULL);
119 }
120
121 evutil_make_socket_closeonexec(epfd);
122
123 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
124 close(epfd);
125 return (NULL);
126 }
127
128 epollop->epfd = epfd;
129
130 /* Initialize fields */
131 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
132 if (epollop->events == NULL) {
133 mm_free(epollop);
134 close(epfd);
135 return (NULL);
136 }
137 epollop->nevents = INITIAL_NEVENT;
138
139 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
140 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
141 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
142 base->evsel = &epollops_changelist;
143
144 evsig_init(base);
145
146 return (epollop);
147 }
148
149 static const char *
change_to_string(int change)150 change_to_string(int change)
151 {
152 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
153 if (change == EV_CHANGE_ADD) {
154 return "add";
155 } else if (change == EV_CHANGE_DEL) {
156 return "del";
157 } else if (change == 0) {
158 return "none";
159 } else {
160 return "???";
161 }
162 }
163
/*
 * Name an epoll_ctl() operation code for log messages.
 *
 * Returns "ADD", "DEL" or "MOD" for the matching EPOLL_CTL_* value, and
 * "???" for anything else.
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
172
173 static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)174 epoll_apply_one_change(struct event_base *base,
175 struct epollop *epollop,
176 const struct event_change *ch)
177 {
178 struct epoll_event epev;
179 int op, events = 0;
180
181 if (1) {
182 /* The logic here is a little tricky. If we had no events set
183 on the fd before, we need to set op="ADD" and set
184 events=the events we want to add. If we had any events set
185 on the fd before, and we want any events to remain on the
186 fd, we need to say op="MOD" and set events=the events we
187 want to remain. But if we want to delete the last event,
188 we say op="DEL" and set events=the remaining events. What
189 fun!
190 */
191
192 /* TODO: Turn this into a switch or a table lookup. */
193
194 if ((ch->read_change & EV_CHANGE_ADD) ||
195 (ch->write_change & EV_CHANGE_ADD)) {
196 /* If we are adding anything at all, we'll want to do
197 * either an ADD or a MOD. */
198 events = 0;
199 op = EPOLL_CTL_ADD;
200 if (ch->read_change & EV_CHANGE_ADD) {
201 events |= EPOLLIN;
202 } else if (ch->read_change & EV_CHANGE_DEL) {
203 ;
204 } else if (ch->old_events & EV_READ) {
205 events |= EPOLLIN;
206 }
207 if (ch->write_change & EV_CHANGE_ADD) {
208 events |= EPOLLOUT;
209 } else if (ch->write_change & EV_CHANGE_DEL) {
210 ;
211 } else if (ch->old_events & EV_WRITE) {
212 events |= EPOLLOUT;
213 }
214 if ((ch->read_change|ch->write_change) & EV_ET)
215 events |= EPOLLET;
216
217 if (ch->old_events) {
218 /* If MOD fails, we retry as an ADD, and if
219 * ADD fails we will retry as a MOD. So the
220 * only hard part here is to guess which one
221 * will work. As a heuristic, we'll try
222 * MOD first if we think there were old
223 * events and ADD if we think there were none.
224 *
225 * We can be wrong about the MOD if the file
226 * has in fact been closed and re-opened.
227 *
228 * We can be wrong about the ADD if the
229 * the fd has been re-created with a dup()
230 * of the same file that it was before.
231 */
232 op = EPOLL_CTL_MOD;
233 }
234 } else if ((ch->read_change & EV_CHANGE_DEL) ||
235 (ch->write_change & EV_CHANGE_DEL)) {
236 /* If we're deleting anything, we'll want to do a MOD
237 * or a DEL. */
238 op = EPOLL_CTL_DEL;
239
240 if (ch->read_change & EV_CHANGE_DEL) {
241 if (ch->write_change & EV_CHANGE_DEL) {
242 events = EPOLLIN|EPOLLOUT;
243 } else if (ch->old_events & EV_WRITE) {
244 events = EPOLLOUT;
245 op = EPOLL_CTL_MOD;
246 } else {
247 events = EPOLLIN;
248 }
249 } else if (ch->write_change & EV_CHANGE_DEL) {
250 if (ch->old_events & EV_READ) {
251 events = EPOLLIN;
252 op = EPOLL_CTL_MOD;
253 } else {
254 events = EPOLLOUT;
255 }
256 }
257 }
258
259 if (!events)
260 return 0;
261
262 memset(&epev, 0, sizeof(epev));
263 epev.data.fd = ch->fd;
264 epev.events = events;
265 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
266 if (op == EPOLL_CTL_MOD && errno == ENOENT) {
267 /* If a MOD operation fails with ENOENT, the
268 * fd was probably closed and re-opened. We
269 * should retry the operation as an ADD.
270 */
271 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
272 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
273 (int)epev.events, ch->fd);
274 return -1;
275 } else {
276 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
277 (int)epev.events,
278 ch->fd));
279 }
280 } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
281 /* If an ADD operation fails with EEXIST,
282 * either the operation was redundant (as with a
283 * precautionary add), or we ran into a fun
284 * kernel bug where using dup*() to duplicate the
285 * same file into the same fd gives you the same epitem
286 * rather than a fresh one. For the second case,
287 * we must retry with MOD. */
288 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
289 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
290 (int)epev.events, ch->fd);
291 return -1;
292 } else {
293 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
294 (int)epev.events,
295 ch->fd));
296 }
297 } else if (op == EPOLL_CTL_DEL &&
298 (errno == ENOENT || errno == EBADF ||
299 errno == EPERM)) {
300 /* If a delete fails with one of these errors,
301 * that's fine too: we closed the fd before we
302 * got around to calling epoll_dispatch. */
303 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
304 (int)epev.events,
305 ch->fd,
306 strerror(errno)));
307 } else {
308 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)",
309 epoll_op_to_string(op),
310 (int)epev.events,
311 ch->fd,
312 ch->old_events,
313 ch->read_change,
314 change_to_string(ch->read_change),
315 ch->write_change,
316 change_to_string(ch->write_change));
317 return -1;
318 }
319 } else {
320 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
321 epoll_op_to_string(op),
322 (int)epev.events,
323 (int)ch->fd,
324 ch->old_events,
325 ch->read_change,
326 ch->write_change));
327 }
328 }
329 return 0;
330 }
331
332 static int
epoll_apply_changes(struct event_base * base)333 epoll_apply_changes(struct event_base *base)
334 {
335 struct event_changelist *changelist = &base->changelist;
336 struct epollop *epollop = base->evbase;
337 struct event_change *ch;
338
339 int r = 0;
340 int i;
341
342 for (i = 0; i < changelist->n_changes; ++i) {
343 ch = &changelist->changes[i];
344 if (epoll_apply_one_change(base, epollop, ch) < 0)
345 r = -1;
346 }
347
348 return (r);
349 }
350
351 static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)352 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
353 short old, short events, void *p)
354 {
355 struct event_change ch;
356 ch.fd = fd;
357 ch.old_events = old;
358 ch.read_change = ch.write_change = 0;
359 if (events & EV_WRITE)
360 ch.write_change = EV_CHANGE_ADD |
361 (events & EV_ET);
362 if (events & EV_READ)
363 ch.read_change = EV_CHANGE_ADD |
364 (events & EV_ET);
365
366 return epoll_apply_one_change(base, base->evbase, &ch);
367 }
368
369 static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)370 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
371 short old, short events, void *p)
372 {
373 struct event_change ch;
374 ch.fd = fd;
375 ch.old_events = old;
376 ch.read_change = ch.write_change = 0;
377 if (events & EV_WRITE)
378 ch.write_change = EV_CHANGE_DEL;
379 if (events & EV_READ)
380 ch.read_change = EV_CHANGE_DEL;
381
382 return epoll_apply_one_change(base, base->evbase, &ch);
383 }
384
385 static int
epoll_dispatch(struct event_base * base,struct timeval * tv)386 epoll_dispatch(struct event_base *base, struct timeval *tv)
387 {
388 struct epollop *epollop = base->evbase;
389 struct epoll_event *events = epollop->events;
390 int i, res;
391 long timeout = -1;
392
393 if (tv != NULL) {
394 timeout = evutil_tv_to_msec(tv);
395 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
396 /* Linux kernels can wait forever if the timeout is
397 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
398 timeout = MAX_EPOLL_TIMEOUT_MSEC;
399 }
400 }
401
402 epoll_apply_changes(base);
403 event_changelist_remove_all(&base->changelist, base);
404
405 EVBASE_RELEASE_LOCK(base, th_base_lock);
406
407 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
408
409 EVBASE_ACQUIRE_LOCK(base, th_base_lock);
410
411 if (res == -1) {
412 if (errno != EINTR) {
413 event_warn("epoll_wait");
414 return (-1);
415 }
416
417 return (0);
418 }
419
420 event_debug(("%s: epoll_wait reports %d", __func__, res));
421 EVUTIL_ASSERT(res <= epollop->nevents);
422
423 for (i = 0; i < res; i++) {
424 int what = events[i].events;
425 short ev = 0;
426
427 if (what & (EPOLLHUP|EPOLLERR)) {
428 ev = EV_READ | EV_WRITE;
429 } else {
430 if (what & EPOLLIN)
431 ev |= EV_READ;
432 if (what & EPOLLOUT)
433 ev |= EV_WRITE;
434 }
435
436 if (!ev)
437 continue;
438
439 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
440 }
441
442 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
443 /* We used all of the event space this time. We should
444 be ready for more events next time. */
445 int new_nevents = epollop->nevents * 2;
446 struct epoll_event *new_events;
447
448 new_events = mm_realloc(epollop->events,
449 new_nevents * sizeof(struct epoll_event));
450 if (new_events) {
451 epollop->events = new_events;
452 epollop->nevents = new_nevents;
453 }
454 }
455
456 return (0);
457 }
458
459
460 static void
epoll_dealloc(struct event_base * base)461 epoll_dealloc(struct event_base *base)
462 {
463 struct epollop *epollop = base->evbase;
464
465 evsig_dealloc(base);
466 if (epollop->events)
467 mm_free(epollop->events);
468 if (epollop->epfd >= 0)
469 close(epollop->epfd);
470
471 memset(epollop, 0, sizeof(struct epollop));
472 mm_free(epollop);
473 }
474