• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* MIT License
2  *
3  * Copyright (c) 2024 Brad House
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a copy
6  * of this software and associated documentation files (the "Software"), to deal
7  * in the Software without restriction, including without limitation the rights
8  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9  * copies of the Software, and to permit persons to whom the Software is
10  * furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * SPDX-License-Identifier: MIT
25  */
26 
27 /* Uses an anonymous union */
28 #if defined(__clang__) || defined(__GNUC__)
29 #  pragma GCC diagnostic push
30 #  if defined(__clang__)
31 #    pragma GCC diagnostic ignored "-Wc11-extensions"
32 #  else
33 #    pragma GCC diagnostic ignored "-Wpedantic"
34 #  endif
35 #endif
36 
37 #include "ares_private.h"
38 #include "ares_event.h"
39 #include "ares_event_win32.h"
40 
41 
42 #if defined(USE_WINSOCK) && defined(CARES_THREADS)
43 
44 #ifdef HAVE_LIMITS_H
45 #  include <limits.h>
46 #endif
47 
48 /* IMPLEMENTATION NOTES
49  * ====================
50  *
51  * This implementation uses some undocumented functionality within Windows for
52  * monitoring sockets. The Ancillary Function Driver (AFD) is the low level
53  * implementation that Winsock2 sits on top of.  Winsock2 unfortunately does
54  * not expose the equivalent of epoll() or kqueue(), but it is possible to
55  * access AFD directly and use along with IOCP to simulate the functionality.
56  * We want to use IOCP if possible as it gives us the ability to monitor more
57  * than just sockets (WSAPoll is not an option), and perform arbitrary callbacks
58  * which means we can hook in non-socket related events.
59  *
60  * The information for this implementation was gathered from "wepoll" and
61  * "libuv" which both use slight variants on this.  We originally went with
62  * an implementation methodology more similar to "libuv", but we had a few
63  * user reports of crashes during shutdown and memory leaks due to some
64  * events not being delivered for cleanup of closed sockets.
65  *
66  * Initialization:
67  *   1. Dynamically load the NtDeviceIoControlFile, NtCreateFile, and
68  *      NtCancelIoFileEx internal symbols from ntdll.dll. (Don't believe
69  *      Microsoft's documentation for NtCancelIoFileEx as it documents an
70  *      invalid prototype). These functions are to open a reference to the
71  *      Ancillary Function Driver (AFD), and to submit and cancel POLL
72  *      requests.
73  *   2. Create an IO Completion Port base handle via CreateIoCompletionPort()
74  *      that all socket events will be delivered through.
75  *   3. Create a list of AFD Handles and track the number of poll requests
76  *      per AFD handle.  When we exceed a pre-determined limit of poll requests
77  *      for a handle (128), we will automatically create a new handle.  The
78  *      reason behind this is NtCancelIoFileEx uses a horrible algorithm for
79  *      issuing cancellations.  See:
80  *      https://github.com/python-trio/trio/issues/52#issuecomment-548215128
81  *   4. Create a callback to be used to be able to interrupt waiting for IOCP
82  *      events, this may be called for allowing enqueuing of additional socket
83  *      events or removing socket events. PostQueuedCompletionStatus() is the
84  *      obvious choice.  We can use the same container format, the event
85  *      delivered won't have an OVERLAPPED pointer so we can differentiate from
86  *      socket events.  Use the container as the completion key.
87  *
88  * Socket Add:
89  *   1. Create/Allocate a container for holding metadata about a socket
90  *      including:
91  *      - SOCKET base_socket;
92  *      - IO_STATUS_BLOCK iosb; -- Used by AFD POLL, returned as OVERLAPPED
93  *      - AFD_POLL_INFO afd_poll_info; -- Used by AFD POLL
94  *      - afd list node -- for tracking which AFD handle a POLL request was
95  *        submitted to.
96  *   2. Call WSAIoctl(..., SIO_BASE_HANDLE, ...) to unwrap the SOCKET and get
97  *      the "base socket" we can use for polling.  It appears this may fail so
98  *      we should call WSAIoctl(..., SIO_BSP_HANDLE_POLL, ...) as a fallback.
99  *   3. Submit AFD POLL request (see "AFD POLL Request" section)
100  *   4. Record a mapping between the "IO Status Block" and the socket container
101  *      so when events are delivered we can dereference.
102  *
103  * Socket Delete:
104  *   1. Call
105  *      NtCancelIoFileEx(afd, iosb, &temp_iosb);
106  *      to cancel any pending operations.
107  *   2. Tag the socket container as being queued for deletion
 *   3. Wait for an event to be delivered for the socket (cancel isn't
 *      immediate; it delivers an event to signal that it is complete). Delete
 *      only once that event has been delivered.  If we don't do this we could
 *      try to access free()'d memory at a later point.
112  *
113  * Socket Modify:
114  *   1. Call
115  *      NtCancelIoFileEx(afd, iosb, &temp_iosb)
116  *      to cancel any pending operation.
117  *   2. When the event comes through that the cancel is complete, enqueue
118  *      another "AFD Poll Request" for the desired events.
119  *
120  * Event Wait:
121  *   1. Call GetQueuedCompletionStatusEx() with the base IOCP handle, a
122  *      stack allocated array of OVERLAPPED_ENTRY's, and an appropriate
123  *      timeout.
 *   2. Iterate across returned events. If the lpOverlapped is NULL, then the
 *      CompletionKey is a pointer to the container registered via
 *      PostQueuedCompletionStatus(); otherwise it is the "IO Status Block"
 *      registered with the "AFD Poll Request" which needs to be dereferenced
 *      to the "socket container".
129  *   3. If it is a "socket container", disassociate it from the afd list node
130  *      it was previously submitted to.
131  *   4. If it is a "socket container" check to see if we are cleaning up, if so,
132  *      clean it up.
133  *   5. If it is a "socket container" that is still valid, Submit an
134  *      AFD POLL Request (see "AFD POLL Request"). We must re-enable the request
135  *      each time we receive a response, it is not persistent.
 *   6. Notify of any events received as indicated in the AFD_POLL_INFO
 *      Handles[0].Events (NOTE: check NumberOfHandles > 0, and the status in
 *      the IO_STATUS_BLOCK).  If we received an AFD_POLL_LOCAL_CLOSE, clean up
 *      the connection like the integrator requested it to be cleaned up.
140  *
141  * AFD Poll Request:
142  *   1. Find an afd poll handle in the list that has fewer pending requests than
143  *      the limit.
144  *   2. If an afd poll handle was not associated (e.g. due to all being over
145  *      limit), create a new afd poll handle by calling NtCreateFile()
146  *      with path \Device\Afd , then add the AFD handle to the IO Completion
147  *      Port.  We can leave the completion key as blank since events for
148  *      multiple sockets will be delivered through this and we need to
149  *      differentiate via the OVERLAPPED member returned.  Add the new AFD
150  *      handle to the list of handles.
151  *   3. Initialize the AFD_POLL_INFO structure:
152  *      Exclusive         = FALSE; // allow multiple requests
153  *      NumberOfHandles   = 1;
154  *      Timeout.QuadPart  = LLONG_MAX;
155  *      Handles[0].Handle = (HANDLE)base_socket;
156  *      Handles[0].Status = 0;
157  *      Handles[0].Events = AFD_POLL_LOCAL_CLOSE + additional events to wait for
158  *                          such as AFD_POLL_RECEIVE, etc;
159  *   4. Zero out the IO_STATUS_BLOCK structures
160  *   5. Set the "Status" member of IO_STATUS_BLOCK to STATUS_PENDING
 *   6. Call
 *      NtDeviceIoControlFile(afd, NULL, NULL, &iosb,
 *                            &iosb, IOCTL_AFD_POLL,
 *                            &afd_poll_info, sizeof(afd_poll_info),
 *                            &afd_poll_info, sizeof(afd_poll_info));
166  *
167  *
168  * References:
169  *   - https://github.com/piscisaureus/wepoll/
170  *   - https://github.com/libuv/libuv/
171  */
172 
173 /* Cap the number of outstanding AFD poll requests per AFD handle due to known
174  * slowdowns with large lists and NtCancelIoFileEx() */
175 #  define AFD_POLL_PER_HANDLE 128
176 
177 #  include <stdarg.h>
178 
179 /* #  define CARES_DEBUG 1 */
180 
/* Lets GCC-compatible compilers type-check printf-style arguments passed to
 * our logging helper; expands to nothing on other compilers. */
#  ifdef __GNUC__
#    define CARES_PRINTF_LIKE(fmt, args) \
      __attribute__((format(printf, fmt, args)))
#  else
#    define CARES_PRINTF_LIKE(fmt, args)
#  endif

static void CARES_DEBUG_LOG(const char *fmt, ...) CARES_PRINTF_LIKE(1, 2);

/*! Debug trace helper.  Writes the formatted message to stderr and flushes it
 *  when CARES_DEBUG is defined at compile time, otherwise it is a no-op.
 *
 *  \param[in] fmt  printf-style format string, followed by its arguments
 */
static void CARES_DEBUG_LOG(const char *fmt, ...)
{
#  ifdef CARES_DEBUG
  va_list vl;

  va_start(vl, fmt);
  vfprintf(stderr, fmt, vl);
  va_end(vl);
  fflush(stderr);
#  else
  /* Logging compiled out, nothing to emit */
  (void)fmt;
#  endif
}
201 
202 typedef struct {
203   /* Dynamically loaded symbols */
204   NtCreateFile_t          NtCreateFile;
205   NtDeviceIoControlFile_t NtDeviceIoControlFile;
206   NtCancelIoFileEx_t      NtCancelIoFileEx;
207 
208   /* Implementation details */
209   ares_slist_t           *afd_handles;
210   HANDLE                  iocp_handle;
211 
212   /* IO_STATUS_BLOCK * -> ares_evsys_win32_eventdata_t * mapping.  There is
213    * no completion key passed to IOCP with this method so we have to look
214    * up based on the lpOverlapped returned (which is mapped to IO_STATUS_BLOCK)
215    */
216   ares_htable_vpvp_t     *sockets;
217 
218   /* Flag about whether or not we are shutting down */
219   ares_bool_t             is_shutdown;
220 } ares_evsys_win32_t;
221 
/*! Lifecycle state of an event container's outstanding request */
typedef enum {
  /*! 0: nothing outstanding, a new request may be submitted */
  POLL_STATUS_NONE = 0,
  /*! 1: an AFD poll (or IOCP signal) has been queued and not yet delivered */
  POLL_STATUS_PENDING,
  /*! 2: a cancel was requested for the outstanding poll */
  POLL_STATUS_CANCEL,
  /*! 3: container is to be destroyed when its next event is delivered */
  POLL_STATUS_DESTROY
} poll_status_t;
228 
229 typedef struct {
230   /*! Pointer to parent event container */
231   ares_event_t        *event;
232   /*! Socket passed in to monitor */
233   SOCKET               socket;
234   /*! Base socket derived from provided socket */
235   SOCKET               base_socket;
236   /*! Structure for submitting AFD POLL requests (Internals!) */
237   AFD_POLL_INFO        afd_poll_info;
238   /*! Status of current polling operation */
239   poll_status_t        poll_status;
240   /*! IO Status Block structure submitted with AFD POLL requests and returned
241    *  with IOCP results as lpOverlapped (even though its a different structure)
242    */
243   IO_STATUS_BLOCK      iosb;
244   /*! AFD handle node an outstanding poll request is associated with */
245   ares_slist_node_t   *afd_handle_node;
246   /* Lock is only for PostQueuedCompletionStatus() to prevent multiple
247    * signals. Tracking via POLL_STATUS_PENDING/POLL_STATUS_NONE */
248   ares_thread_mutex_t *lock;
249 } ares_evsys_win32_eventdata_t;
250 
251 static size_t ares_evsys_win32_wait(ares_event_thread_t *e,
252                                     unsigned long        timeout_ms);
253 
ares_iocpevent_signal(const ares_event_t * event)254 static void   ares_iocpevent_signal(const ares_event_t *event)
255 {
256   ares_event_thread_t          *e           = event->e;
257   ares_evsys_win32_t           *ew          = e->ev_sys_data;
258   ares_evsys_win32_eventdata_t *ed          = event->data;
259   ares_bool_t                   queue_event = ARES_FALSE;
260 
261   ares_thread_mutex_lock(ed->lock);
262   if (ed->poll_status != POLL_STATUS_PENDING) {
263     ed->poll_status = POLL_STATUS_PENDING;
264     queue_event     = ARES_TRUE;
265   }
266   ares_thread_mutex_unlock(ed->lock);
267 
268   if (!queue_event) {
269     return;
270   }
271 
272   PostQueuedCompletionStatus(ew->iocp_handle, 0, (ULONG_PTR)event->data, NULL);
273 }
274 
/*! Callback for the IOCP signal event.  Its only job is to mark the signal
 *  as consumed so ares_iocpevent_signal() is able to post a new one.
 *
 *  \param[in] e      Event thread (unused)
 *  \param[in] fd     Socket (unused)
 *  \param[in] data   ares_evsys_win32_eventdata_t of the signal event
 *  \param[in] flags  Event flags (unused)
 */
static void ares_iocpevent_cb(ares_event_thread_t *e, ares_socket_t fd,
                              void *data, ares_event_flags_t flags)
{
  ares_evsys_win32_eventdata_t *sigdata = data;

  (void)flags;
  (void)fd;
  (void)e;

  ares_thread_mutex_lock(sigdata->lock);
  sigdata->poll_status = POLL_STATUS_NONE;
  ares_thread_mutex_unlock(sigdata->lock);
}
286 
ares_iocpevent_create(ares_event_thread_t * e)287 static ares_event_t *ares_iocpevent_create(ares_event_thread_t *e)
288 {
289   ares_event_t *event = NULL;
290   ares_status_t status;
291 
292   status =
293     ares_event_update(&event, e, ARES_EVENT_FLAG_OTHER, ares_iocpevent_cb,
294                       ARES_SOCKET_BAD, NULL, NULL, ares_iocpevent_signal);
295   if (status != ARES_SUCCESS) {
296     return NULL;
297   }
298 
299   return event;
300 }
301 
/*! Tear down the Win32 event system attached to the event thread.
 *
 *  Flags the system as shutting down, keeps processing events until no
 *  socket containers remain tracked, then closes the IOCP handle and frees
 *  the AFD handle list, the socket table and the state itself.
 *
 *  \param[in] e  Event thread, NULL is tolerated
 */
static void ares_evsys_win32_destroy(ares_event_thread_t *e)
{
  ares_evsys_win32_t *state;

  if (e == NULL) {
    return;
  }

  CARES_DEBUG_LOG("** Win32 Event Destroy\n");

  state = e->ev_sys_data;
  if (state == NULL) {
    return;
  }

  /* From here on, delivered events are only used to finish the deferred
   * destruction of socket containers */
  state->is_shutdown = ARES_TRUE;

  CARES_DEBUG_LOG("  ** waiting on %lu remaining sockets to be destroyed\n",
                  (unsigned long)ares_htable_vpvp_num_keys(state->sockets));
  while (ares_htable_vpvp_num_keys(state->sockets) != 0) {
    ares_evsys_win32_wait(e, 0);
  }
  CARES_DEBUG_LOG("  ** all sockets cleaned up\n");

  if (state->iocp_handle != NULL) {
    CloseHandle(state->iocp_handle);
  }

  ares_slist_destroy(state->afd_handles);
  ares_htable_vpvp_destroy(state->sockets);
  ares_free(state);

  e->ev_sys_data = NULL;
}
337 
338 typedef struct {
339   size_t poll_cnt;
340   HANDLE afd_handle;
341 } ares_afd_handle_t;
342 
ares_afd_handle_destroy(void * arg)343 static void ares_afd_handle_destroy(void *arg)
344 {
345   ares_afd_handle_t *hnd = arg;
346   if (hnd != NULL && hnd->afd_handle != NULL) {
347     CloseHandle(hnd->afd_handle);
348   }
349   ares_free(hnd);
350 }
351 
ares_afd_handle_cmp(const void * data1,const void * data2)352 static int ares_afd_handle_cmp(const void *data1, const void *data2)
353 {
354   const ares_afd_handle_t *hnd1 = data1;
355   const ares_afd_handle_t *hnd2 = data2;
356 
357   if (hnd1->poll_cnt > hnd2->poll_cnt) {
358     return 1;
359   }
360   if (hnd1->poll_cnt < hnd2->poll_cnt) {
361     return -1;
362   }
363   return 0;
364 }
365 
/*! Initialize an OBJECT_ATTRIBUTES structure: everything is zeroed and then
 *  the length, object name and attribute flags are filled in.
 *
 *  \param[out] attr        Structure to initialize
 *  \param[in]  name        Object name to reference (pointer is stored, not
 *                          copied)
 *  \param[in]  attributes  Attribute flags to store
 */
static void fill_object_attributes(OBJECT_ATTRIBUTES *attr,
                                   UNICODE_STRING *name, ULONG attributes)
{
  memset(attr, 0, sizeof(*attr));

  attr->Attributes = attributes;
  attr->ObjectName = name;
  attr->Length     = sizeof(*attr);
}
374 
/* Brace initializer for a UNICODE_STRING built from a narrow string literal.
 * Yields, in order: the size in bytes of the wide string without its
 * terminator, the size in bytes including the terminator, and the wide
 * (L"...") form of the literal.  Only valid for string literals. */
#  define UNICODE_STRING_CONSTANT(str)                                      \
    { (sizeof(str) - 1) * sizeof(wchar_t), sizeof(str) * sizeof(wchar_t), \
      L##str }
377 
ares_afd_handle_create(ares_evsys_win32_t * ew)378 static ares_slist_node_t *ares_afd_handle_create(ares_evsys_win32_t *ew)
379 {
380   UNICODE_STRING     afd_device_name = UNICODE_STRING_CONSTANT("\\Device\\Afd");
381   OBJECT_ATTRIBUTES  afd_attributes;
382   NTSTATUS           status;
383   IO_STATUS_BLOCK    iosb;
384   ares_afd_handle_t *afd  = ares_malloc_zero(sizeof(*afd));
385   ares_slist_node_t *node = NULL;
386   if (afd == NULL) {
387     goto fail;
388   }
389 
390   /* Open a handle to the AFD subsystem */
391   fill_object_attributes(&afd_attributes, &afd_device_name, 0);
392   memset(&iosb, 0, sizeof(iosb));
393   iosb.Status = STATUS_PENDING;
394   status      = ew->NtCreateFile(&afd->afd_handle, SYNCHRONIZE, &afd_attributes,
395                                  &iosb, NULL, 0, FILE_SHARE_READ | FILE_SHARE_WRITE,
396                                  FILE_OPEN, 0, NULL, 0);
397   if (status != STATUS_SUCCESS) {
398     CARES_DEBUG_LOG("** Failed to create AFD endpoint\n");
399     goto fail;
400   }
401 
402   if (CreateIoCompletionPort(afd->afd_handle, ew->iocp_handle,
403                              0 /* CompletionKey */, 0) == NULL) {
404     goto fail;
405   }
406 
407   if (!SetFileCompletionNotificationModes(afd->afd_handle,
408                                           FILE_SKIP_SET_EVENT_ON_HANDLE)) {
409     goto fail;
410   }
411 
412   node = ares_slist_insert(ew->afd_handles, afd);
413   if (node == NULL) {
414     goto fail;
415   }
416 
417   return node;
418 
419 fail:
420 
421   ares_afd_handle_destroy(afd);
422   return NULL;
423 }
424 
425 /* Fetch the lowest poll count entry, but if it exceeds the limit, create a
426  * new one and return that */
ares_afd_handle_fetch(ares_evsys_win32_t * ew)427 static ares_slist_node_t *ares_afd_handle_fetch(ares_evsys_win32_t *ew)
428 {
429   ares_slist_node_t *node = ares_slist_node_first(ew->afd_handles);
430   ares_afd_handle_t *afd  = ares_slist_node_val(node);
431 
432   if (afd != NULL && afd->poll_cnt < AFD_POLL_PER_HANDLE) {
433     return node;
434   }
435 
436   return ares_afd_handle_create(ew);
437 }
438 
ares_evsys_win32_init(ares_event_thread_t * e)439 static ares_bool_t ares_evsys_win32_init(ares_event_thread_t *e)
440 {
441   ares_evsys_win32_t *ew = NULL;
442   HMODULE             ntdll;
443 
444   CARES_DEBUG_LOG("** Win32 Event Init\n");
445 
446   ew = ares_malloc_zero(sizeof(*ew));
447   if (ew == NULL) {
448     return ARES_FALSE;
449   }
450 
451   e->ev_sys_data = ew;
452 
453   /* All apps should have ntdll.dll already loaded, so just get a handle to
454    * this */
455   ntdll = GetModuleHandleA("ntdll.dll");
456   if (ntdll == NULL) {
457     goto fail;
458   }
459 
460 #  ifdef __GNUC__
461 #    pragma GCC diagnostic push
462 #    pragma GCC diagnostic ignored "-Wpedantic"
463 /* Without the (void *) cast we get:
464  *  warning: cast between incompatible function types from 'FARPROC' {aka 'long
465  * long int (*)()'} to 'NTSTATUS (*)(...)'} [-Wcast-function-type] but with it
466  * we get: warning: ISO C forbids conversion of function pointer to object
467  * pointer type [-Wpedantic] look unsolvable short of killing the warning.
468  */
469 #  endif
470 
471   /* Load Internal symbols not typically accessible */
472   ew->NtCreateFile =
473     (NtCreateFile_t)(void *)GetProcAddress(ntdll, "NtCreateFile");
474   ew->NtDeviceIoControlFile = (NtDeviceIoControlFile_t)(void *)GetProcAddress(
475     ntdll, "NtDeviceIoControlFile");
476   ew->NtCancelIoFileEx =
477     (NtCancelIoFileEx_t)(void *)GetProcAddress(ntdll, "NtCancelIoFileEx");
478 
479 #  ifdef __GNUC__
480 #    pragma GCC diagnostic pop
481 #  endif
482 
483   if (ew->NtCreateFile == NULL || ew->NtCancelIoFileEx == NULL ||
484       ew->NtDeviceIoControlFile == NULL) {
485     goto fail;
486   }
487 
488   ew->iocp_handle = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
489   if (ew->iocp_handle == NULL) {
490     goto fail;
491   }
492 
493   ew->afd_handles = ares_slist_create(
494     e->channel->rand_state, ares_afd_handle_cmp, ares_afd_handle_destroy);
495   if (ew->afd_handles == NULL) {
496     goto fail;
497   }
498 
499   /* Create at least the first afd handle, so we know of any critical system
500    * issues during startup */
501   if (ares_afd_handle_create(ew) == NULL) {
502     goto fail;
503   }
504 
505   e->ev_signal = ares_iocpevent_create(e);
506   if (e->ev_signal == NULL) {
507     goto fail;
508   }
509 
510   ew->sockets = ares_htable_vpvp_create(NULL, NULL);
511   if (ew->sockets == NULL) {
512     goto fail;
513   }
514 
515   return ARES_TRUE;
516 
517 fail:
518   ares_evsys_win32_destroy(e);
519   return ARES_FALSE;
520 }
521 
/*! Resolve the base socket underlying the provided socket, as needed for
 *  submitting AFD poll requests.
 *
 *  SIO_BASE_HANDLE is tried first.  If that fails for a reason other than
 *  WSAENOTSOCK, SIO_BSP_HANDLE_POLL is used to peel off one layer and the
 *  lookup is retried on the socket it returns.
 *
 *  \param[in] socket  Socket provided by the caller
 *  \return base socket, or ARES_SOCKET_BAD if it could not be determined
 */
static ares_socket_t ares_evsys_win32_basesocket(ares_socket_t socket)
{
  for (;;) {
    ares_socket_t unwrapped = ARES_SOCKET_BAD;
    DWORD         nbytes; /* Required by the API, value not used */
    int           ioctl_rv;

    ioctl_rv = WSAIoctl(socket, SIO_BASE_HANDLE, NULL, 0, &unwrapped,
                        sizeof(unwrapped), &nbytes, NULL, NULL);
    if (ioctl_rv != SOCKET_ERROR && unwrapped != ARES_SOCKET_BAD) {
      return unwrapped;
    }

    /* Lookup failed.  Not being a socket at all is fatal. */
    if (GetLastError() == WSAENOTSOCK) {
      return ARES_SOCKET_BAD;
    }

    /* Work around known bug in Komodia based LSPs, use SIO_BSP_HANDLE_POLL
     * to retrieve the underlying socket, then go around again to get its
     * base socket:
     *  https://docs.microsoft.com/en-us/windows/win32/winsock/winsock-ioctls
     *  https://www.komodia.com/newwiki/index.php?title=Komodia%27s_Redirector_bug_fixes#Version_2.2.2.6
     */
    unwrapped = ARES_SOCKET_BAD;
    ioctl_rv  = WSAIoctl(socket, SIO_BSP_HANDLE_POLL, NULL, 0, &unwrapped,
                         sizeof(unwrapped), &nbytes, NULL, NULL);

    /* Give up on error, or if no progress was made (same socket returned) */
    if (ioctl_rv == SOCKET_ERROR || unwrapped == ARES_SOCKET_BAD ||
        unwrapped == socket) {
      return ARES_SOCKET_BAD;
    }

    socket = unwrapped;
  }
}
562 
ares_evsys_win32_afd_enqueue(ares_event_t * event,ares_event_flags_t flags)563 static ares_bool_t ares_evsys_win32_afd_enqueue(ares_event_t      *event,
564                                                 ares_event_flags_t flags)
565 {
566   ares_event_thread_t          *e  = event->e;
567   ares_evsys_win32_t           *ew = e->ev_sys_data;
568   ares_evsys_win32_eventdata_t *ed = event->data;
569   ares_afd_handle_t            *afd;
570   NTSTATUS                      status;
571 
572   if (e == NULL || ed == NULL || ew == NULL) {
573     return ARES_FALSE;
574   }
575 
576   /* Misuse */
577   if (ed->poll_status != POLL_STATUS_NONE) {
578     return ARES_FALSE;
579   }
580 
581   ed->afd_handle_node = ares_afd_handle_fetch(ew);
582   /* System resource issue? */
583   if (ed->afd_handle_node == NULL) {
584     return ARES_FALSE;
585   }
586 
587   afd = ares_slist_node_val(ed->afd_handle_node);
588 
589   /* Enqueue AFD Poll */
590   ed->afd_poll_info.Exclusive         = FALSE;
591   ed->afd_poll_info.NumberOfHandles   = 1;
592   ed->afd_poll_info.Timeout.QuadPart  = LLONG_MAX;
593   ed->afd_poll_info.Handles[0].Handle = (HANDLE)ed->base_socket;
594   ed->afd_poll_info.Handles[0].Status = 0;
595   ed->afd_poll_info.Handles[0].Events = AFD_POLL_LOCAL_CLOSE;
596 
597   if (flags & ARES_EVENT_FLAG_READ) {
598     ed->afd_poll_info.Handles[0].Events |=
599       (AFD_POLL_RECEIVE | AFD_POLL_DISCONNECT | AFD_POLL_ACCEPT |
600        AFD_POLL_ABORT);
601   }
602   if (flags & ARES_EVENT_FLAG_WRITE) {
603     ed->afd_poll_info.Handles[0].Events |=
604       (AFD_POLL_SEND | AFD_POLL_CONNECT_FAIL);
605   }
606   if (flags == 0) {
607     ed->afd_poll_info.Handles[0].Events |= AFD_POLL_DISCONNECT;
608   }
609 
610   memset(&ed->iosb, 0, sizeof(ed->iosb));
611   ed->iosb.Status = STATUS_PENDING;
612 
613   status = ew->NtDeviceIoControlFile(
614     afd->afd_handle, NULL, NULL, &ed->iosb, &ed->iosb, IOCTL_AFD_POLL,
615     &ed->afd_poll_info, sizeof(ed->afd_poll_info), &ed->afd_poll_info,
616     sizeof(ed->afd_poll_info));
617   if (status != STATUS_SUCCESS && status != STATUS_PENDING) {
618     CARES_DEBUG_LOG("** afd_enqueue ed=%p FAILED\n", (void *)ed);
619     ed->afd_handle_node = NULL;
620     return ARES_FALSE;
621   }
622 
623   /* Record that we submitted a poll request to this handle and tell it to
624    * re-sort the node since we changed its sort value */
625   afd->poll_cnt++;
626   ares_slist_node_reinsert(ed->afd_handle_node);
627 
628   ed->poll_status = POLL_STATUS_PENDING;
629   CARES_DEBUG_LOG("++ afd_enqueue ed=%p flags=%X\n", (void *)ed,
630                   (unsigned int)flags);
631   return ARES_TRUE;
632 }
633 
ares_evsys_win32_afd_cancel(ares_evsys_win32_eventdata_t * ed)634 static ares_bool_t ares_evsys_win32_afd_cancel(ares_evsys_win32_eventdata_t *ed)
635 {
636   IO_STATUS_BLOCK     cancel_iosb;
637   ares_evsys_win32_t *ew;
638   NTSTATUS            status;
639   ares_afd_handle_t  *afd;
640 
641   ew = ed->event->e->ev_sys_data;
642 
643   /* Misuse */
644   if (ed->poll_status != POLL_STATUS_PENDING) {
645     return ARES_FALSE;
646   }
647 
648   afd = ares_slist_node_val(ed->afd_handle_node);
649 
650   /* Misuse */
651   if (afd == NULL) {
652     return ARES_FALSE;
653   }
654 
655   ed->poll_status = POLL_STATUS_CANCEL;
656 
657   /* Not pending, nothing to do. Most likely that means there is a pending
658    * event that hasn't yet been delivered otherwise it would be re-armed
659    * already */
660   if (ed->iosb.Status != STATUS_PENDING) {
661     CARES_DEBUG_LOG("** cancel not needed for ed=%p\n", (void *)ed);
662     return ARES_FALSE;
663   }
664 
665   status = ew->NtCancelIoFileEx(afd->afd_handle, &ed->iosb, &cancel_iosb);
666 
667   CARES_DEBUG_LOG("** Enqueued cancel for ed=%p, status = %lX\n", (void *)ed,
668                   status);
669 
670   /* NtCancelIoFileEx() may return STATUS_NOT_FOUND if the operation completed
671    * just before calling NtCancelIoFileEx(), but we have not yet received the
672    * notification (but it should be queued for the next IOCP event).  */
673   if (status == STATUS_SUCCESS || status == STATUS_NOT_FOUND) {
674     return ARES_TRUE;
675   }
676 
677   return ARES_FALSE;
678 }
679 
/*! Release an event container.  Socket containers are also dropped from the
 *  IO_STATUS_BLOCK lookup table, and the owning event (if still attached)
 *  has its data pointer cleared.
 *
 *  \param[in] ew  Win32 event system state
 *  \param[in] ed  Container to destroy; nothing is done if either argument
 *                 is NULL
 */
static void ares_evsys_win32_eventdata_destroy(ares_evsys_win32_t           *ew,
                                               ares_evsys_win32_eventdata_t *ed)
{
  ares_bool_t is_socket;

  if (ed == NULL || ew == NULL) {
    return;
  }

  is_socket = (ed->socket != ARES_SOCKET_BAD) ? ARES_TRUE : ARES_FALSE;

  CARES_DEBUG_LOG("-- deleting ed=%p (%s)\n", (void *)ed,
                  is_socket ? "socket" : "data");

  /* Socket containers are destroyed in a deferred manner and are tracked
   * until then, stop tracking now */
  if (is_socket) {
    ares_htable_vpvp_remove(ew->sockets, &ed->iosb);
  }

  ares_thread_mutex_destroy(ed->lock);

  /* Make sure the owning event doesn't keep a dangling pointer */
  if (ed->event != NULL) {
    ed->event->data = NULL;
  }

  ares_free(ed);
}
701 
ares_evsys_win32_event_add(ares_event_t * event)702 static ares_bool_t ares_evsys_win32_event_add(ares_event_t *event)
703 {
704   ares_event_thread_t          *e  = event->e;
705   ares_evsys_win32_t           *ew = e->ev_sys_data;
706   ares_evsys_win32_eventdata_t *ed;
707   ares_bool_t                   rc = ARES_FALSE;
708 
709   ed              = ares_malloc_zero(sizeof(*ed));
710   ed->event       = event;
711   ed->socket      = event->fd;
712   ed->base_socket = ARES_SOCKET_BAD;
713   event->data     = ed;
714 
715   CARES_DEBUG_LOG("++ add ed=%p (%s) flags=%X\n", (void *)ed,
716                   (ed->socket == ARES_SOCKET_BAD) ? "data" : "socket",
717                   (unsigned int)event->flags);
718 
719   /* Likely a signal event, not something we will directly handle.  We create
720    * the ares_evsys_win32_eventdata_t as the placeholder to use as the
721    * IOCP Completion Key */
722   if (ed->socket == ARES_SOCKET_BAD) {
723     ed->lock = ares_thread_mutex_create();
724     if (ed->lock == NULL) {
725       goto done;
726     }
727     rc = ARES_TRUE;
728     goto done;
729   }
730 
731   ed->base_socket = ares_evsys_win32_basesocket(ed->socket);
732   if (ed->base_socket == ARES_SOCKET_BAD) {
733     goto done;
734   }
735 
736   if (!ares_htable_vpvp_insert(ew->sockets, &ed->iosb, ed)) {
737     goto done;
738   }
739 
740   if (!ares_evsys_win32_afd_enqueue(event, event->flags)) {
741     goto done;
742   }
743 
744   rc = ARES_TRUE;
745 
746 done:
747   if (!rc) {
748     ares_evsys_win32_eventdata_destroy(ew, ed);
749     event->data = NULL;
750   }
751   return rc;
752 }
753 
/*! Unregister an event.
 *
 *  Non-socket containers are destroyed immediately.  Socket containers have
 *  their pending poll cancelled and are tagged POLL_STATUS_DESTROY and
 *  detached from the event; they are not freed here.
 *
 *  \param[in] event  Event being removed
 */
static void ares_evsys_win32_event_del(ares_event_t *event)
{
  ares_evsys_win32_eventdata_t *evdata = event->data;
  ares_bool_t                   is_socket;

  /* Nothing attached, likely already cleaned up due to a LOCAL_CLOSE */
  if (evdata == NULL) {
    return;
  }

  is_socket = (evdata->socket != ARES_SOCKET_BAD) ? ARES_TRUE : ARES_FALSE;

  CARES_DEBUG_LOG("-- DELETE requested for ed=%p (%s)\n", (void *)evdata,
                  is_socket ? "socket" : "data");

  if (!is_socket) {
    ares_evsys_win32_eventdata_destroy(event->e->ev_sys_data, evdata);
    event->data = NULL;
    return;
  }

  /* Cancel the pending AFD Poll operation, then flag the container for
   * destruction and detach it from the event that is going away */
  ares_evsys_win32_afd_cancel(evdata);
  evdata->poll_status = POLL_STATUS_DESTROY;
  evdata->event       = NULL;

  event->data = NULL;
}
779 
/*! Handle a change of the flags an event is interested in.  Only socket
 *  events with an attached container are handled; the outstanding poll is
 *  cancelled so that a new one gets enqueued once the cancellation is
 *  delivered.
 *
 *  \param[in] event      Event being modified
 *  \param[in] new_flags  New flags (only used for debug logging here)
 */
static void ares_evsys_win32_event_mod(ares_event_t      *event,
                                       ares_event_flags_t new_flags)
{
  ares_evsys_win32_eventdata_t *evdata = event->data;

  /* Non-socket events and events without a container aren't ours */
  if (evdata == NULL || event->fd == ARES_SOCKET_BAD) {
    return;
  }

  CARES_DEBUG_LOG("** mod ed=%p new_flags=%X\n", (void *)evdata,
                  (unsigned int)new_flags);

  /* Cancelling is sufficient.  Delivery of the cancellation event
   * automatically re-enqueues a new poll request */
  ares_evsys_win32_afd_cancel(evdata);
}
798 
ares_evsys_win32_process_other_event(ares_evsys_win32_t * ew,ares_evsys_win32_eventdata_t * ed,size_t i)799 static ares_bool_t ares_evsys_win32_process_other_event(
800   ares_evsys_win32_t *ew, ares_evsys_win32_eventdata_t *ed, size_t i)
801 {
802   ares_event_t *event;
803 
804   /* NOTE: do NOT dereference 'ed' if during shutdown as this could be an
805    * invalid pointer if the signal handle was cleaned up, but there was still a
806    * pending event! */
807 
808   if (ew->is_shutdown) {
809     CARES_DEBUG_LOG("\t\t** i=%lu, skip non-socket handle during shutdown\n",
810                     (unsigned long)i);
811     return ARES_FALSE;
812   }
813 
814   event = ed->event;
815   CARES_DEBUG_LOG("\t\t** i=%lu, ed=%p (data)\n", (unsigned long)i, (void *)ed);
816 
817   event->cb(event->e, event->fd, event->data, ARES_EVENT_FLAG_OTHER);
818   return ARES_TRUE;
819 }
820 
ares_evsys_win32_process_socket_event(ares_evsys_win32_t * ew,ares_evsys_win32_eventdata_t * ed,size_t i)821 static ares_bool_t ares_evsys_win32_process_socket_event(
822   ares_evsys_win32_t *ew, ares_evsys_win32_eventdata_t *ed, size_t i)
823 {
824   ares_event_flags_t flags = 0;
825   ares_event_t      *event = NULL;
826   ares_afd_handle_t *afd   = NULL;
827 
828   /* Shouldn't be possible */
829   if (ed == NULL) {
830     CARES_DEBUG_LOG("\t\t** i=%lu, Invalid handle.\n", (unsigned long)i);
831     return ARES_FALSE;
832   }
833 
834   event = ed->event;
835 
836   CARES_DEBUG_LOG("\t\t** i=%lu, ed=%p (socket)\n", (unsigned long)i,
837                   (void *)ed);
838 
839   /* Process events */
840   if (ed->poll_status == POLL_STATUS_PENDING &&
841       ed->iosb.Status == STATUS_SUCCESS &&
842       ed->afd_poll_info.NumberOfHandles > 0) {
843     if (ed->afd_poll_info.Handles[0].Events &
844         (AFD_POLL_RECEIVE | AFD_POLL_DISCONNECT | AFD_POLL_ACCEPT |
845          AFD_POLL_ABORT)) {
846       flags |= ARES_EVENT_FLAG_READ;
847     }
848     if (ed->afd_poll_info.Handles[0].Events &
849         (AFD_POLL_SEND | AFD_POLL_CONNECT_FAIL)) {
850       flags |= ARES_EVENT_FLAG_WRITE;
851     }
852     if (ed->afd_poll_info.Handles[0].Events & AFD_POLL_LOCAL_CLOSE) {
853       CARES_DEBUG_LOG("\t\t** ed=%p LOCAL CLOSE\n", (void *)ed);
854       ed->poll_status = POLL_STATUS_DESTROY;
855     }
856   }
857 
858   CARES_DEBUG_LOG("\t\t** ed=%p, iosb status=%lX, poll_status=%d, flags=%X\n",
859                   (void *)ed, (unsigned long)ed->iosb.Status,
860                   (int)ed->poll_status, (unsigned int)flags);
861 
862   /* Decrement poll count for AFD handle then resort, also disassociate
863    * with socket */
864   afd = ares_slist_node_val(ed->afd_handle_node);
865   afd->poll_cnt--;
866   ares_slist_node_reinsert(ed->afd_handle_node);
867   ed->afd_handle_node = NULL;
868 
869   /* Pending destroy, go ahead and kill it */
870   if (ed->poll_status == POLL_STATUS_DESTROY) {
871     ares_evsys_win32_eventdata_destroy(ew, ed);
872     return ARES_FALSE;
873   }
874 
875   ed->poll_status = POLL_STATUS_NONE;
876 
877   /* Mask flags against current desired flags.  We could have an event
878    * queued that is outdated. */
879   flags &= event->flags;
880 
881   /* Don't actually do anything with the event that was delivered as we are
882    * in a shutdown/cleanup process.  Mostly just handling the delayed
883    * destruction of sockets */
884   if (ew->is_shutdown) {
885     return ARES_FALSE;
886   }
887 
888   /* Re-enqueue so we can get more events on the socket, we either
889    * received a real event, or a cancellation notice.  Both cases we
890    * re-queue using the current configured event flags.
891    *
892    * If we can't re-enqueue, that likely means the socket has been
893    * closed, so we want to kill our reference to it
894    */
895   if (!ares_evsys_win32_afd_enqueue(event, event->flags)) {
896     ares_evsys_win32_eventdata_destroy(ew, ed);
897     return ARES_FALSE;
898   }
899 
900   /* No events we recognize to deliver */
901   if (flags == 0) {
902     return ARES_FALSE;
903   }
904 
905   event->cb(event->e, event->fd, event->data, flags);
906   return ARES_TRUE;
907 }
908 
/* Wait for completions on the IOCP handle and dispatch them.
 *
 * e:          event thread; its ev_sys_data holds the win32 event system
 *             state.
 * timeout_ms: how long the first wait may block, in milliseconds.  A value
 *             of 0 is translated to INFINITE.
 *
 * Completions are fetched in batches of up to 16 entries.  As long as a batch
 * comes back completely full, another fetch is attempted with a zero timeout
 * so that already-queued completions are drained without blocking again.
 *
 * Returns the number of entries whose processing reported that an event was
 * actually delivered.
 */
static size_t ares_evsys_win32_wait(ares_event_thread_t *e,
                                    unsigned long        timeout_ms)
{
  ares_evsys_win32_t *ew = e->ev_sys_data;
  OVERLAPPED_ENTRY    entries[16];
  const ULONG         max_entries = (ULONG)(sizeof(entries) / sizeof(entries[0]));
  size_t              delivered_cnt = 0;
  DWORD               tout          = INFINITE;

  if (timeout_ms != 0) {
    tout = (DWORD)timeout_ms;
  }

  CARES_DEBUG_LOG("** Wait Enter\n");

  for (;;) {
    ULONG  nentries = max_entries;
    BOOL   ok       = GetQueuedCompletionStatusEx(ew->iocp_handle, entries,
                                                  max_entries, &nentries, tout,
                                                  FALSE);
    size_t idx;

    /* Any further fetch must return immediately when nothing is queued */
    tout = 0;

    if (!ok) {
      break;
    }

    CARES_DEBUG_LOG("\t** GetQueuedCompletionStatusEx returned %lu entries\n",
                    (unsigned long)nentries);

    for (idx = 0; idx < (size_t)nentries; idx++) {
      OVERLAPPED_ENTRY             *entry = &entries[idx];
      ares_evsys_win32_eventdata_t *ed    = NULL;
      ares_bool_t                   delivered;

      /* Entries without a completion key are socket completions: the event
       * data has to be looked up via the lpOverlapped pointer.  Entries
       * triggered via PostQueuedCompletionStatus() carry the event data
       * directly in lpCompletionKey. */
      if (entry->lpCompletionKey == 0) {
        ed = ares_htable_vpvp_get_direct(ew->sockets, entry->lpOverlapped);
        delivered = ares_evsys_win32_process_socket_event(ew, ed, idx);
      } else {
        ed        = (ares_evsys_win32_eventdata_t *)entry->lpCompletionKey;
        delivered = ares_evsys_win32_process_other_event(ew, ed, idx);
      }

      /* Count only entries that resulted in an actual event */
      if (delivered) {
        delivered_cnt++;
      }
    }

    /* A partially filled batch means the queue has been drained */
    if (nentries != max_entries) {
      break;
    }
  }

  CARES_DEBUG_LOG("** Wait Exit\n");

  return delivered_cnt;
}
966 
967 const ares_event_sys_t ares_evsys_win32 = { "win32",
968                                             ares_evsys_win32_init,
969                                             ares_evsys_win32_destroy,
970                                             ares_evsys_win32_event_add,
971                                             ares_evsys_win32_event_del,
972                                             ares_evsys_win32_event_mod,
973                                             ares_evsys_win32_wait };
974 #endif
975 
976 #if defined(__clang__) || defined(__GNUC__)
977 #  pragma GCC diagnostic pop
978 #endif
979