1
2 /* Copyright 1998 by the Massachusetts Institute of Technology.
3 * Copyright (C) 2004-2010 by Daniel Stenberg
4 *
5 * Permission to use, copy, modify, and distribute this
6 * software and its documentation for any purpose and without
7 * fee is hereby granted, provided that the above copyright
8 * notice appear in all copies and that both that copyright
9 * notice and this permission notice appear in supporting
10 * documentation, and that the name of M.I.T. not be used in
11 * advertising or publicity pertaining to distribution of the
12 * software without specific, written prior permission.
13 * M.I.T. makes no representations about the suitability of
14 * this software for any purpose. It is provided "as is"
15 * without express or implied warranty.
16 */
17
18 #include "ares_setup.h"
19
20 #ifdef HAVE_SYS_SOCKET_H
21 # include <sys/socket.h>
22 #endif
23 #ifdef HAVE_SYS_UIO_H
24 # include <sys/uio.h>
25 #endif
26 #ifdef HAVE_NETINET_IN_H
27 # include <netinet/in.h>
28 #endif
29 #ifdef HAVE_NETINET_TCP_H
30 # include <netinet/tcp.h>
31 #endif
32 #ifdef HAVE_NETDB_H
33 # include <netdb.h>
34 #endif
35 #ifdef HAVE_ARPA_NAMESER_H
36 # include <arpa/nameser.h>
37 #else
38 # include "nameser.h"
39 #endif
40 #ifdef HAVE_ARPA_NAMESER_COMPAT_H
41 # include <arpa/nameser_compat.h>
42 #endif
43
44 #ifdef HAVE_SYS_TIME_H
45 # include <sys/time.h>
46 #endif
47
48 #ifdef HAVE_STRINGS_H
49 # include <strings.h>
50 #endif
51 #ifdef HAVE_UNISTD_H
52 # include <unistd.h>
53 #endif
54 #ifdef HAVE_SYS_IOCTL_H
55 # include <sys/ioctl.h>
56 #endif
57 #ifdef NETWARE
58 # include <sys/filio.h>
59 #endif
60
61 #include <assert.h>
62 #include <string.h>
63 #include <stdlib.h>
64 #include <fcntl.h>
65 #include <time.h>
66
67 #include "ares.h"
68 #include "ares_dns.h"
69 #include "ares_nowarn.h"
70 #include "ares_private.h"
71
72
73 static int try_again(int errnum);
74 static void write_tcp_data(ares_channel channel, fd_set *write_fds,
75 ares_socket_t write_fd, struct timeval *now);
76 static void read_tcp_data(ares_channel channel, fd_set *read_fds,
77 ares_socket_t read_fd, struct timeval *now);
78 static void read_udp_packets(ares_channel channel, fd_set *read_fds,
79 ares_socket_t read_fd, struct timeval *now);
80 static void advance_tcp_send_queue(ares_channel channel, int whichserver,
81 ssize_t num_bytes);
82 static void process_timeouts(ares_channel channel, struct timeval *now);
83 static void process_broken_connections(ares_channel channel,
84 struct timeval *now);
85 static void process_answer(ares_channel channel, unsigned char *abuf,
86 int alen, int whichserver, int tcp,
87 struct timeval *now);
88 static void handle_error(ares_channel channel, int whichserver,
89 struct timeval *now);
90 static void skip_server(ares_channel channel, struct query *query,
91 int whichserver);
92 static void next_server(ares_channel channel, struct query *query,
93 struct timeval *now);
94 static int open_tcp_socket(ares_channel channel, struct server_state *server);
95 static int open_udp_socket(ares_channel channel, struct server_state *server);
96 static int same_questions(const unsigned char *qbuf, int qlen,
97 const unsigned char *abuf, int alen);
98 static int same_address(struct sockaddr *sa, struct ares_addr *aa);
99 static void end_query(ares_channel channel, struct query *query, int status,
100 unsigned char *abuf, int alen);
101
/* Return 1 when 'now' has reached or passed 'check', else 0. */
int ares__timedout(struct timeval *now,
                   struct timeval *check)
{
  long sec_diff = now->tv_sec - check->tv_sec;

  if (sec_diff != 0)
    return (sec_diff > 0) ? 1 : 0;

  /* The seconds match, so the microsecond fields decide. */
  return (now->tv_usec >= check->tv_usec) ? 1 : 0;
}
116
/* Advance the timeval pointed to by 'now' by 'millisecs' milliseconds.
 * Always returns 0. */
int ares__timeadd(struct timeval *now,
                  int millisecs)
{
  long extra_sec = millisecs / 1000;
  long extra_usec = (millisecs % 1000) * 1000;

  now->tv_sec += extra_sec;
  now->tv_usec += extra_usec;

  /* Carry a microsecond overflow into the seconds field; one carry is
   * enough because extra_usec is strictly below one million. */
  if (now->tv_usec >= 1000000) {
    now->tv_sec += 1;
    now->tv_usec -= 1000000;
  }

  return 0;
}
131
/* Return the distance from 'now' to the (future) 'check' time, expressed
 * in milliseconds. */
long ares__timeoffset(struct timeval *now,
                      struct timeval *check)
{
  long msec_from_secs = (check->tv_sec - now->tv_sec) * 1000;
  long msec_from_usecs = (check->tv_usec - now->tv_usec) / 1000;

  return msec_from_secs + msec_from_usecs;
}
139
140
141 /*
142 * generic process function
143 */
processfds(ares_channel channel,fd_set * read_fds,ares_socket_t read_fd,fd_set * write_fds,ares_socket_t write_fd)144 static void processfds(ares_channel channel,
145 fd_set *read_fds, ares_socket_t read_fd,
146 fd_set *write_fds, ares_socket_t write_fd)
147 {
148 struct timeval now = ares__tvnow();
149
150 write_tcp_data(channel, write_fds, write_fd, &now);
151 read_tcp_data(channel, read_fds, read_fd, &now);
152 read_udp_packets(channel, read_fds, read_fd, &now);
153 process_timeouts(channel, &now);
154 process_broken_connections(channel, &now);
155 }
156
/* Something interesting happened on the wire, or there was a timeout.
 * See what's up and respond accordingly.
 *
 * fd_set-based public entry point: the caller passes the read/write sets
 * returned ready by select(); per-fd arguments are disabled here.
 */
void ares_process(ares_channel channel, fd_set *read_fds, fd_set *write_fds)
{
  processfds(channel, read_fds, ARES_SOCKET_BAD, write_fds, ARES_SOCKET_BAD);
}
164
/* Something interesting happened on the wire, or there was a timeout.
 * See what's up and respond accordingly.
 *
 * Single-descriptor public entry point: used by event loops that report
 * readiness one fd at a time instead of via fd_sets.
 */
void ares_process_fd(ares_channel channel,
                     ares_socket_t read_fd,       /* use ARES_SOCKET_BAD or valid
                                                     file descriptors */
                     ares_socket_t write_fd)
{
  processfds(channel, NULL, read_fd, NULL, write_fd);
}
175
176
/* Return 1 if the specified error number describes a readiness error, or 0
 * otherwise. This is mostly for HP-UX, which could return EAGAIN or
 * EWOULDBLOCK. See this man page
 *
 * http://devrsrc1.external.hp.com/STKS/cgi-bin/man2html?
 * manpage=/usr/share/man/man2.Z/send.2
 */
static int try_again(int errnum)
{
#if !defined EWOULDBLOCK && !defined EAGAIN
#error "Neither EWOULDBLOCK nor EAGAIN defined"
#endif
#ifdef EWOULDBLOCK
  if (errnum == EWOULDBLOCK)
    return 1;
#endif
#if defined EAGAIN && EAGAIN != EWOULDBLOCK
  /* Only compare EAGAIN separately on platforms where it is a distinct
   * value from EWOULDBLOCK. */
  if (errnum == EAGAIN)
    return 1;
#endif
  return 0;
}
202
/* If any TCP sockets select true for writing, write out queued data
 * we have for them.
 */
static void write_tcp_data(ares_channel channel,
                           fd_set *write_fds,
                           ares_socket_t write_fd,
                           struct timeval *now)
{
  struct server_state *server;
  struct send_request *sendreq;
  struct iovec *vec;
  int i;
  ssize_t scount;
  ssize_t wcount;
  size_t n;

  if(!write_fds && (write_fd == ARES_SOCKET_BAD))
    /* no possible action */
    return;

  for (i = 0; i < channel->nservers; i++)
    {
      /* Make sure server has data to send and is selected in write_fds or
         write_fd. */
      server = &channel->servers[i];
      if (!server->qhead || server->tcp_socket == ARES_SOCKET_BAD ||
          server->is_broken)
        continue;

      if(write_fds) {
        if(!FD_ISSET(server->tcp_socket, write_fds))
          continue;
      }
      else {
        if(server->tcp_socket != write_fd)
          continue;
      }

      if(write_fds)
        /* If there's an error and we close this socket, then open
         * another with the same fd to talk to another server, then we
         * don't want to think that it was the new socket that was
         * ready. This is not disastrous, but is likely to result in
         * extra system calls and confusion. */
        FD_CLR(server->tcp_socket, write_fds);

      /* Count the number of send queue items. */
      n = 0;
      for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
        n++;

      /* Allocate iovecs so we can send all our data at once. */
      vec = malloc(n * sizeof(struct iovec));
      if (vec)
        {
          /* Fill in the iovecs and send. */
          n = 0;
          for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
            {
              vec[n].iov_base = (char *) sendreq->data;
              vec[n].iov_len = sendreq->len;
              n++;
            }
          wcount = (ssize_t)writev(server->tcp_socket, vec, (int)n);
          free(vec);
          if (wcount < 0)
            {
              /* A would-block error just means "try later"; anything
               * else tears down this server's connection. */
              if (!try_again(SOCKERRNO))
                handle_error(channel, i, now);
              continue;
            }

          /* Advance the send queue by as many bytes as we sent.
           * A short write is fine: the unsent tail stays queued. */
          advance_tcp_send_queue(channel, i, wcount);
        }
      else
        {
          /* Can't allocate iovecs; just send the first request as a
           * degraded fallback so we still make forward progress. */
          sendreq = server->qhead;

          scount = swrite(server->tcp_socket, sendreq->data, sendreq->len);
          if (scount < 0)
            {
              if (!try_again(SOCKERRNO))
                handle_error(channel, i, now);
              continue;
            }

          /* Advance the send queue by as many bytes as we sent. */
          advance_tcp_send_queue(channel, i, scount);
        }
    }
}
296
/* Consume the given number of bytes from the head of the TCP send queue.
 * Fully-sent requests are unlinked and freed; a partially-sent head
 * request is shrunk in place so the next write resumes mid-buffer.
 */
static void advance_tcp_send_queue(ares_channel channel, int whichserver,
                                   ssize_t num_bytes)
{
  struct send_request *sendreq;
  struct server_state *server = &channel->servers[whichserver];
  while (num_bytes > 0) {
    sendreq = server->qhead;
    if ((size_t)num_bytes >= sendreq->len) {
      /* The whole head request went out: unlink and release it.
       * data_storage is only non-NULL when the request owns a private
       * copy of the packet (see ares__send_query). */
      num_bytes -= sendreq->len;
      server->qhead = sendreq->next;
      if (sendreq->data_storage)
        free(sendreq->data_storage);
      free(sendreq);
      if (server->qhead == NULL) {
        /* Queue drained: tell the caller's event loop to stop
         * watching this socket for writability. */
        SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 0);
        server->qtail = NULL;

        /* qhead is NULL so we cannot continue this loop */
        break;
      }
    }
    else {
      /* Partial send: advance the head request's cursor. */
      sendreq->data += num_bytes;
      sendreq->len -= num_bytes;
      num_bytes = 0;
    }
  }
}
326
327 /* If any TCP socket selects true for reading, read some data,
328 * allocate a buffer if we finish reading the length word, and process
329 * a packet if we finish reading one.
330 */
read_tcp_data(ares_channel channel,fd_set * read_fds,ares_socket_t read_fd,struct timeval * now)331 static void read_tcp_data(ares_channel channel, fd_set *read_fds,
332 ares_socket_t read_fd, struct timeval *now)
333 {
334 struct server_state *server;
335 int i;
336 ssize_t count;
337
338 if(!read_fds && (read_fd == ARES_SOCKET_BAD))
339 /* no possible action */
340 return;
341
342 for (i = 0; i < channel->nservers; i++)
343 {
344 /* Make sure the server has a socket and is selected in read_fds. */
345 server = &channel->servers[i];
346 if (server->tcp_socket == ARES_SOCKET_BAD || server->is_broken)
347 continue;
348
349 if(read_fds) {
350 if(!FD_ISSET(server->tcp_socket, read_fds))
351 continue;
352 }
353 else {
354 if(server->tcp_socket != read_fd)
355 continue;
356 }
357
358 if(read_fds)
359 /* If there's an error and we close this socket, then open
360 * another with the same fd to talk to another server, then we
361 * don't want to think that it was the new socket that was
362 * ready. This is not disastrous, but is likely to result in
363 * extra system calls and confusion. */
364 FD_CLR(server->tcp_socket, read_fds);
365
366 if (server->tcp_lenbuf_pos != 2)
367 {
368 /* We haven't yet read a length word, so read that (or
369 * what's left to read of it).
370 */
371 count = sread(server->tcp_socket,
372 server->tcp_lenbuf + server->tcp_lenbuf_pos,
373 2 - server->tcp_lenbuf_pos);
374 if (count <= 0)
375 {
376 if (!(count == -1 && try_again(SOCKERRNO)))
377 handle_error(channel, i, now);
378 continue;
379 }
380
381 server->tcp_lenbuf_pos += (int)count;
382 if (server->tcp_lenbuf_pos == 2)
383 {
384 /* We finished reading the length word. Decode the
385 * length and allocate a buffer for the data.
386 */
387 server->tcp_length = server->tcp_lenbuf[0] << 8
388 | server->tcp_lenbuf[1];
389 server->tcp_buffer = malloc(server->tcp_length);
390 if (!server->tcp_buffer)
391 handle_error(channel, i, now);
392 server->tcp_buffer_pos = 0;
393 }
394 }
395 else
396 {
397 /* Read data into the allocated buffer. */
398 count = sread(server->tcp_socket,
399 server->tcp_buffer + server->tcp_buffer_pos,
400 server->tcp_length - server->tcp_buffer_pos);
401 if (count <= 0)
402 {
403 if (!(count == -1 && try_again(SOCKERRNO)))
404 handle_error(channel, i, now);
405 continue;
406 }
407
408 server->tcp_buffer_pos += (int)count;
409 if (server->tcp_buffer_pos == server->tcp_length)
410 {
411 /* We finished reading this answer; process it and
412 * prepare to read another length word.
413 */
414 process_answer(channel, server->tcp_buffer, server->tcp_length,
415 i, 1, now);
416 if (server->tcp_buffer)
417 free(server->tcp_buffer);
418 server->tcp_buffer = NULL;
419 server->tcp_lenbuf_pos = 0;
420 server->tcp_buffer_pos = 0;
421 }
422 }
423 }
424 }
425
/* If any UDP sockets select true for reading, process them. */
static void read_udp_packets(ares_channel channel, fd_set *read_fds,
                             ares_socket_t read_fd, struct timeval *now)
{
  struct server_state *server;
  int i;
  ssize_t count;
  unsigned char buf[PACKETSZ + 1];  /* +1 so an oversized datagram is
                                       detectable as count > PACKETSZ */
#ifdef HAVE_RECVFROM
  ares_socklen_t fromlen;
  union {
    struct sockaddr sa;
    struct sockaddr_in sa4;
    struct sockaddr_in6 sa6;
  } from;
#endif

  if(!read_fds && (read_fd == ARES_SOCKET_BAD))
    /* no possible action */
    return;

  for (i = 0; i < channel->nservers; i++)
    {
      /* Make sure the server has a socket and is selected in read_fds. */
      server = &channel->servers[i];

      if (server->udp_socket == ARES_SOCKET_BAD || server->is_broken)
        continue;

      if(read_fds) {
        if(!FD_ISSET(server->udp_socket, read_fds))
          continue;
      }
      else {
        if(server->udp_socket != read_fd)
          continue;
      }

      if(read_fds)
        /* If there's an error and we close this socket, then open
         * another with the same fd to talk to another server, then we
         * don't want to think that it was the new socket that was
         * ready. This is not disastrous, but is likely to result in
         * extra system calls and confusion. */
        FD_CLR(server->udp_socket, read_fds);

      /* To reduce event loop overhead, read and process as many
       * packets as we can. */
      do {
#ifdef HAVE_RECVFROM
        if (server->addr.family == AF_INET)
          fromlen = sizeof(from.sa4);
        else
          fromlen = sizeof(from.sa6);
        count = (ssize_t)recvfrom(server->udp_socket, (void *)buf, sizeof(buf),
                                  0, &from.sa, &fromlen);
#else
        count = sread(server->udp_socket, buf, sizeof(buf));
#endif
        if (count == -1 && try_again(SOCKERRNO))
          /* Note: 'continue' in a do-while re-tests the condition, and
           * count == -1 fails it, so a would-block error exits the
           * read loop for this (drained) socket. */
          continue;
        else if (count <= 0)
          handle_error(channel, i, now);
#ifdef HAVE_RECVFROM
        else if (!same_address(&from.sa, &server->addr))
          /* The address the response comes from does not match
           * the address we sent the request to. Someone may be
           * attempting to perform a cache poisoning attack. */
          break;
#endif
        else
          process_answer(channel, buf, (int)count, i, 0, now);
      } while (count > 0);
    }
}
501
/* If any queries have timed out, note the timeout and move them on. */
static void process_timeouts(ares_channel channel, struct timeval *now)
{
  time_t t;  /* the time of the timeouts we're processing */
  struct query *query;
  struct list_node* list_head;
  struct list_node* list_node;

  /* Process all the timeouts that have fired since the last time we
   * processed timeouts. If things are going well, then we'll have
   * hundreds/thousands of queries that fall into future buckets, and
   * only a handful of requests that fall into the "now" bucket, so
   * this should be quite quick.
   */
  for (t = channel->last_timeout_processed; t <= now->tv_sec; t++)
    {
      /* Queries are bucketed by the second their timeout lands in,
       * modulo the table size (see ares__send_query). */
      list_head = &(channel->queries_by_timeout[t % ARES_TIMEOUT_TABLE_SIZE]);
      for (list_node = list_head->next; list_node != list_head; )
        {
          query = list_node->data;
          list_node = list_node->next;  /* in case the query gets deleted */
          /* NOTE(review): the tv_sec != 0 guard presumably skips
           * entries with no timeout armed — confirm against callers. */
          if (query->timeout.tv_sec && ares__timedout(now, &query->timeout))
            {
              query->error_status = ARES_ETIMEOUT;
              ++query->timeouts;
              /* next_server() re-sends or, if out of attempts, fails
               * the query with the recorded error status. */
              next_server(channel, query, now);
            }
        }
    }
  channel->last_timeout_processed = now->tv_sec;
}
533
/* Handle an answer from a server: match it to an outstanding query,
 * apply truncation/response-code policy, and complete the query.
 * abuf/alen is the raw DNS reply; whichserver identifies the source;
 * tcp is non-zero when the reply arrived over TCP.
 */
static void process_answer(ares_channel channel, unsigned char *abuf,
                           int alen, int whichserver, int tcp,
                           struct timeval *now)
{
  int tc, rcode;
  unsigned short id;
  struct query *query;
  struct list_node* list_head;
  struct list_node* list_node;

  /* If there's no room in the answer for a header, we can't do much
   * with it. */
  if (alen < HFIXEDSZ)
    return;

  /* Grab the query ID, truncate bit, and response code from the packet. */
  id = DNS_HEADER_QID(abuf);
  tc = DNS_HEADER_TC(abuf);
  rcode = DNS_HEADER_RCODE(abuf);

  /* Find the query corresponding to this packet. The queries are
   * hashed/bucketed by query id, so this lookup should be quick.
   * Note that both the query id and the questions must be the same;
   * when the query id wraps around we can have multiple outstanding
   * queries with the same query id, so we need to check both the id and
   * question.
   */
  query = NULL;
  list_head = &(channel->queries_by_qid[id % ARES_QID_TABLE_SIZE]);
  for (list_node = list_head->next; list_node != list_head;
       list_node = list_node->next)
    {
      struct query *q = list_node->data;
      if ((q->qid == id) && same_questions(q->qbuf, q->qlen, abuf, alen))
        {
          query = q;
          break;
        }
    }
  if (!query)
    /* Unmatched reply: stale, duplicate, or spoofed — drop it. */
    return;

  /* If we got a truncated UDP packet and are not ignoring truncation,
   * don't accept the packet, and switch the query to TCP if we hadn't
   * done so already.
   */
  if ((tc || alen > PACKETSZ) && !tcp && !(channel->flags & ARES_FLAG_IGNTC))
    {
      if (!query->using_tcp)
        {
          query->using_tcp = 1;
          ares__send_query(channel, query, now);
        }
      return;
    }

  /* Limit alen to PACKETSZ if we aren't using TCP (only relevant if we
   * are ignoring truncation.
   */
  if (alen > PACKETSZ && !tcp)
    alen = PACKETSZ;

  /* If we aren't passing through all error packets, discard packets
   * with SERVFAIL, NOTIMP, or REFUSED response codes.
   */
  if (!(channel->flags & ARES_FLAG_NOCHECKRESP))
    {
      if (rcode == SERVFAIL || rcode == NOTIMP || rcode == REFUSED)
        {
          /* Penalize this server and retry elsewhere — but only
           * advance if the query is still pointed at this server
           * (skip_server may be a no-op with a single server). */
          skip_server(channel, query, whichserver);
          if (query->server == whichserver)
            next_server(channel, query, now);
          return;
        }
    }

  end_query(channel, query, ARES_SUCCESS, abuf, alen);
}
613
614 /* Close all the connections that are no longer usable. */
process_broken_connections(ares_channel channel,struct timeval * now)615 static void process_broken_connections(ares_channel channel,
616 struct timeval *now)
617 {
618 int i;
619 for (i = 0; i < channel->nservers; i++)
620 {
621 struct server_state *server = &channel->servers[i];
622 if (server->is_broken)
623 {
624 handle_error(channel, i, now);
625 }
626 }
627 }
628
/* Tear down communications with a failed server and re-route every
 * query that was in flight to it. */
static void handle_error(ares_channel channel, int whichserver,
                         struct timeval *now)
{
  struct server_state *server;
  struct query *query;
  struct list_node list_head;
  struct list_node* list_node;

  server = &channel->servers[whichserver];

  /* Reset communications with this server. */
  ares__close_sockets(channel, server);

  /* Tell all queries talking to this server to move on and not try
   * this server again. We steal the current list of queries that were
   * in-flight to this server, since when we call next_server this can
   * cause the queries to be re-sent to this server, which will
   * re-insert these queries in that same server->queries_to_server
   * list.
   */
  ares__init_list_head(&list_head);
  ares__swap_lists(&list_head, &(server->queries_to_server));
  for (list_node = list_head.next; list_node != &list_head; )
    {
      query = list_node->data;
      list_node = list_node->next;  /* in case the query gets deleted */
      assert(query->server == whichserver);
      skip_server(channel, query, whichserver);
      next_server(channel, query, now);
    }
  /* Each query should have removed itself from our temporary list as
   * it re-sent itself or finished up...
   */
  assert(ares__is_list_empty(&list_head));
}
664
/* The given server gave us problems with this query, so if we have
 * the luxury of using other servers, then let's skip the
 * potentially broken server and just use the others. If we only
 * have one server and we need to retry then we should just go ahead
 * and re-use that server, since it's our only hope; perhaps we
 * just got unlucky, and retrying will work (eg, the server timed
 * out our TCP connection just as we were sending another request).
 */
static void skip_server(ares_channel channel, struct query *query,
                        int whichserver) {
  if (channel->nservers <= 1)
    return;

  query->server_info[whichserver].skip_server = 1;
}
680
/* Move a query on to its next candidate server, or fail it once every
 * attempt has been used up. */
static void next_server(ares_channel channel, struct query *query,
                        struct timeval *now)
{
  /* We need to try each server channel->tries times. We have channel->nservers
   * servers to try. In total, we need to do channel->nservers * channel->tries
   * attempts. Use query->try to remember how many times we already attempted
   * this query. Use modular arithmetic to find the next server to try. */
  while (++(query->try_count) < (channel->nservers * channel->tries))
    {
      struct server_state *server;

      /* Move on to the next server. */
      query->server = (query->server + 1) % channel->nservers;
      server = &channel->servers[query->server];

      /* We don't want to use this server if (1) we decided this
       * connection is broken, and thus about to be closed, (2)
       * we've decided to skip this server because of earlier
       * errors we encountered, or (3) we already sent this query
       * over this exact connection.
       */
      if (!server->is_broken &&
           !query->server_info[query->server].skip_server &&
           !(query->using_tcp &&
             (query->server_info[query->server].tcp_connection_generation ==
              server->tcp_connection_generation)))
        {
           ares__send_query(channel, query, now);
           return;
        }

      /* You might think that with TCP we only need one try. However,
       * even when using TCP, servers can time-out our connection just
       * as we're sending a request, or close our connection because
       * they die, or never send us a reply because they get wedged or
       * tickle a bug that drops our request.
       */
    }

  /* If we are here, all attempts to perform query failed.
   * error_status carries the reason recorded by the last failure. */
  end_query(channel, query, query->error_status, NULL, 0);
}
723
/* Transmit a query to its currently-selected server (TCP or UDP),
 * arm its timeout, and record it in the timeout and per-server
 * bookkeeping lists. On transmit failure the query is moved on via
 * next_server(); on allocation failure it is ended with ARES_ENOMEM. */
void ares__send_query(ares_channel channel, struct query *query,
                      struct timeval *now)
{
  struct send_request *sendreq;
  struct server_state *server;
  int timeplus;

  server = &channel->servers[query->server];
  if (query->using_tcp)
    {
      /* Make sure the TCP socket for this server is set up and queue
       * a send request.
       */
      if (server->tcp_socket == ARES_SOCKET_BAD)
        {
          if (open_tcp_socket(channel, server) == -1)
            {
              skip_server(channel, query, query->server);
              next_server(channel, query, now);
              return;
            }
        }
      sendreq = calloc(1, sizeof(struct send_request));
      if (!sendreq)
        {
        end_query(channel, query, ARES_ENOMEM, NULL, 0);
          return;
        }
      /* To make the common case fast, we avoid copies by using the
       * query's tcpbuf for as long as the query is alive. In the rare
       * case where the query ends while it's queued for transmission,
       * then we give the sendreq its own copy of the request packet
       * and put it in sendreq->data_storage.
       */
      sendreq->data_storage = NULL;
      sendreq->data = query->tcpbuf;
      sendreq->len = query->tcplen;
      sendreq->owner_query = query;
      sendreq->next = NULL;
      if (server->qtail)
        server->qtail->next = sendreq;
      else
        {
          /* First queued request: start watching the socket for
           * writability as well as readability. */
          SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 1);
          server->qhead = sendreq;
        }
      server->qtail = sendreq;
      /* Remember which connection this query went out on, so a retry
       * can avoid re-sending over the very same connection. */
      query->server_info[query->server].tcp_connection_generation =
        server->tcp_connection_generation;
    }
  else
    {
      if (server->udp_socket == ARES_SOCKET_BAD)
        {
          if (open_udp_socket(channel, server) == -1)
            {
              skip_server(channel, query, query->server);
              next_server(channel, query, now);
              return;
            }
        }
      if (swrite(server->udp_socket, query->qbuf, query->qlen) == -1)
        {
          /* FIXME: Handle EAGAIN here since it likely can happen. */
          skip_server(channel, query, query->server);
          next_server(channel, query, now);
          return;
        }
    }
  /* Exponential backoff: the base timeout doubles after each full
   * round through the server list... */
  timeplus = channel->timeout << (query->try_count / channel->nservers);
  /* ...then is jittered to 9/16..16/16 of itself so retries from many
   * clients don't synchronize. */
  timeplus = (timeplus * (9 + (rand () & 7))) / 16;
  query->timeout = *now;
  ares__timeadd(&query->timeout,
                timeplus);
  /* Keep track of queries bucketed by timeout, so we can process
   * timeout events quickly.
   */
  ares__remove_from_list(&(query->queries_by_timeout));
  ares__insert_in_list(
    &(query->queries_by_timeout),
    &(channel->queries_by_timeout[query->timeout.tv_sec %
                                  ARES_TIMEOUT_TABLE_SIZE]));

  /* Keep track of queries bucketed by server, so we can process server
   * errors quickly.
   */
  ares__remove_from_list(&(query->queries_to_server));
  ares__insert_in_list(&(query->queries_to_server),
                       &(server->queries_to_server));
}
814
/*
 * setsocknonblock sets the given socket to either blocking or non-blocking
 * mode based on the 'nonblock' boolean argument. This function is highly
 * portable: exactly one of the platform branches below is compiled in,
 * chosen by the configure-time feature macros.
 */
static int setsocknonblock(ares_socket_t sockfd,    /* operate on this */
                           int nonblock   /* TRUE or FALSE */)
{
#if defined(USE_BLOCKING_SOCKETS)

  return 0; /* returns success */

#elif defined(HAVE_FCNTL_O_NONBLOCK)

  /* most recent unix versions */
  int flags;
  flags = fcntl(sockfd, F_GETFL, 0);
  if (FALSE != nonblock)
    return fcntl(sockfd, F_SETFL, flags | O_NONBLOCK);
  else
    return fcntl(sockfd, F_SETFL, flags & (~O_NONBLOCK));

#elif defined(HAVE_IOCTL_FIONBIO)

  /* older unix versions */
  int flags;
  flags = nonblock;
  return ioctl(sockfd, FIONBIO, &flags);

#elif defined(HAVE_IOCTLSOCKET_FIONBIO)

#ifdef WATT32
  char flags;
#else
  /* Windows */
  unsigned long flags;
#endif
  flags = nonblock;
  return ioctlsocket(sockfd, FIONBIO, &flags);

#elif defined(HAVE_IOCTLSOCKET_CAMEL_FIONBIO)

  /* Amiga */
  return IoctlSocket(sockfd, FIONBIO, (long)nonblock);

#elif defined(HAVE_SETSOCKOPT_SO_NONBLOCK)

  /* BeOS */
  long b = nonblock ? 1 : 0;
  return setsockopt(sockfd, SOL_SOCKET, SO_NONBLOCK, &b, sizeof(b));

#else
#  error "no non-blocking method was found/used/set"
#endif
}
870
/* Apply channel-wide options to a freshly created socket: non-blocking
 * mode, close-on-exec, buffer sizes, device binding, and an optional
 * local source address. Returns 0 on success, -1 on a fatal setup
 * failure (the caller closes the socket). */
static int configure_socket(ares_socket_t s, int family, ares_channel channel)
{
  union {
    struct sockaddr sa;
    struct sockaddr_in sa4;
    struct sockaddr_in6 sa6;
  } local;

  setsocknonblock(s, TRUE);

#if defined(FD_CLOEXEC) && !defined(MSDOS)
  /* Configure the socket fd as close-on-exec. */
  if (fcntl(s, F_SETFD, FD_CLOEXEC) == -1)
    return -1;
#endif

  /* Set the socket's send and receive buffer sizes. */
  if ((channel->socket_send_buffer_size > 0) &&
      setsockopt(s, SOL_SOCKET, SO_SNDBUF,
                 (void *)&channel->socket_send_buffer_size,
                 sizeof(channel->socket_send_buffer_size)) == -1)
    return -1;

  if ((channel->socket_receive_buffer_size > 0) &&
      setsockopt(s, SOL_SOCKET, SO_RCVBUF,
                 (void *)&channel->socket_receive_buffer_size,
                 sizeof(channel->socket_receive_buffer_size)) == -1)
    return -1;

#ifdef SO_BINDTODEVICE
  if (channel->local_dev_name[0]) {
    if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
                   channel->local_dev_name, sizeof(channel->local_dev_name))) {
      /* Only root can do this, and usually not fatal if it doesn't work, so */
      /* just continue on. */
    }
  }
#endif

  /* Bind to the configured local source address, if any; a zero/any
   * address means "let the OS choose" and skips the bind. */
  if (family == AF_INET) {
    if (channel->local_ip4) {
      memset(&local.sa4, 0, sizeof(local.sa4));
      local.sa4.sin_family = AF_INET;
      local.sa4.sin_addr.s_addr = htonl(channel->local_ip4);
      if (bind(s, &local.sa, sizeof(local.sa4)) < 0)
        return -1;
    }
  }
  else if (family == AF_INET6) {
    if (memcmp(channel->local_ip6, &ares_in6addr_any, sizeof(channel->local_ip6)) != 0) {
      memset(&local.sa6, 0, sizeof(local.sa6));
      local.sa6.sin6_family = AF_INET6;
      memcpy(&local.sa6.sin6_addr, channel->local_ip6, sizeof(channel->local_ip6));
      if (bind(s, &local.sa, sizeof(local.sa6)) < 0)
        return -1;
    }
  }

  return 0;
}
931
/* Create, configure, and (non-blockingly) connect a TCP socket to the
 * given server, then install it in the server state. Returns 0 on
 * success, -1 (or the create-callback's error) on failure. */
static int open_tcp_socket(ares_channel channel, struct server_state *server)
{
  ares_socket_t s;
  int opt;
  ares_socklen_t salen;
  union {
    struct sockaddr_in sa4;
    struct sockaddr_in6 sa6;
  } saddr;
  struct sockaddr *sa;

  /* Build the destination sockaddr for the server's address family. */
  switch (server->addr.family)
    {
      case AF_INET:
        sa = (void *)&saddr.sa4;
        salen = sizeof(saddr.sa4);
        memset(sa, 0, salen);
        saddr.sa4.sin_family = AF_INET;
        saddr.sa4.sin_port = (unsigned short)(channel->tcp_port & 0xffff);
        memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
               sizeof(server->addr.addrV4));
        break;
      case AF_INET6:
        sa = (void *)&saddr.sa6;
        salen = sizeof(saddr.sa6);
        memset(sa, 0, salen);
        saddr.sa6.sin6_family = AF_INET6;
        saddr.sa6.sin6_port = (unsigned short)(channel->tcp_port & 0xffff);
        memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
               sizeof(server->addr.addrV6));
        break;
      default:
        return -1;
    }

  /* Acquire a socket. */
  s = socket(server->addr.family, SOCK_STREAM, 0);
  if (s == ARES_SOCKET_BAD)
    return -1;

  /* Configure it. */
  if (configure_socket(s, server->addr.family, channel) < 0)
    {
       sclose(s);
       return -1;
    }

#ifdef TCP_NODELAY
  /*
   * Disable the Nagle algorithm (only relevant for TCP sockets, and thus not
   * in configure_socket). In general, in DNS lookups we're pretty much
   * interested in firing off a single request and then waiting for a reply,
   * so batching isn't very interesting.
   */
  opt = 1;
  if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY,
                 (void *)&opt, sizeof(opt)) == -1)
    {
       sclose(s);
       return -1;
    }
#endif

  /* Connect to the server. The socket is non-blocking, so an
   * in-progress connection is expected and not an error. */
  if (connect(s, sa, salen) == -1)
    {
      int err = SOCKERRNO;

      if (err != EINPROGRESS && err != EWOULDBLOCK)
        {
          sclose(s);
          return -1;
        }
    }

  if (channel->sock_create_cb)
    {
      int err = channel->sock_create_cb(s, SOCK_STREAM,
                                        channel->sock_create_cb_data);
      if (err < 0)
        {
          sclose(s);
          return err;
        }
    }

  SOCK_STATE_CALLBACK(channel, s, 1, 0);
  server->tcp_buffer_pos = 0;
  server->tcp_socket = s;
  /* Bump the generation counter so retries can tell this connection
   * apart from any earlier one on the same server. */
  server->tcp_connection_generation = ++channel->tcp_connection_generation;
  return 0;
}
1024
/* Create and connect the UDP socket used to talk to SERVER.
 *
 * Mirrors open_tcp_socket() minus the TCP-only tuning: build the
 * destination sockaddr, create a datagram socket, make it non-blocking
 * via configure_socket(), connect() it (so plain send/recv can be used),
 * and consult the application's socket creation callback.  Returns 0 on
 * success and records the socket in *server; returns -1 (or the
 * callback's negative return value) on failure.
 */
static int open_udp_socket(ares_channel channel, struct server_state *server)
{
  ares_socket_t fd;
  ares_socklen_t addrlen;
  union {
    struct sockaddr_in sa4;
    struct sockaddr_in6 sa6;
  } addr;
  struct sockaddr *sa;

  /* Build the destination address from the configured server entry. */
  switch (server->addr.family)
    {
      case AF_INET:
        sa = (void *)&addr.sa4;
        addrlen = sizeof(addr.sa4);
        memset(sa, 0, addrlen);
        addr.sa4.sin_family = AF_INET;
        addr.sa4.sin_port = (unsigned short)(channel->udp_port & 0xffff);
        memcpy(&addr.sa4.sin_addr, &server->addr.addrV4,
               sizeof(server->addr.addrV4));
        break;
      case AF_INET6:
        sa = (void *)&addr.sa6;
        addrlen = sizeof(addr.sa6);
        memset(sa, 0, addrlen);
        addr.sa6.sin6_family = AF_INET6;
        addr.sa6.sin6_port = (unsigned short)(channel->udp_port & 0xffff);
        memcpy(&addr.sa6.sin6_addr, &server->addr.addrV6,
               sizeof(server->addr.addrV6));
        break;
      default:
        return -1;  /* unsupported address family */
    }

  /* Acquire a socket. */
  fd = socket(server->addr.family, SOCK_DGRAM, 0);
  if (fd == ARES_SOCKET_BAD)
    return -1;

  /* Set the socket non-blocking and apply channel socket options. */
  if (configure_socket(fd, server->addr.family, channel) < 0)
    {
      sclose(fd);
      return -1;
    }

  /* Connect so later sends need no explicit destination. */
  if (connect(fd, sa, addrlen) == -1)
    {
      int err = SOCKERRNO;

      if (err != EINPROGRESS && err != EWOULDBLOCK)
        {
          sclose(fd);
          return -1;
        }
    }

  /* Let the application inspect or reject the new socket. */
  if (channel->sock_create_cb)
    {
      int err = channel->sock_create_cb(fd, SOCK_DGRAM,
                                        channel->sock_create_cb_data);
      if (err < 0)
        {
          sclose(fd);
          return err;
        }
    }

  SOCK_STATE_CALLBACK(channel, fd, 1, 0);

  server->udp_socket = fd;
  return 0;
}
1099
/* Check whether a DNS reply's question section matches the query we sent.
 *
 * qbuf/qlen is the request packet; abuf/alen is the candidate reply.
 * Returns 1 when the QDCOUNTs agree and every question in qbuf appears
 * among the reply's questions with the same name (compared
 * case-insensitively), type, and class; returns 0 on any mismatch or on
 * a truncated/malformed packet.  Used to guard against mismatched or
 * spoofed responses before accepting a reply for a query.
 */
static int same_questions(const unsigned char *qbuf, int qlen,
                          const unsigned char *abuf, int alen)
{
  /* Parallel cursors/state for walking the query (q) and answer (a)
   * question sections. */
  struct {
    const unsigned char *p;
    int qdcount;
    char *name;
    long namelen;
    int type;
    int dnsclass;
  } q, a;
  int i, j;

  /* Both packets must at least hold a full DNS header. */
  if (qlen < HFIXEDSZ || alen < HFIXEDSZ)
    return 0;

  /* Extract qdcount from the request and reply buffers and compare them. */
  q.qdcount = DNS_HEADER_QDCOUNT(qbuf);
  a.qdcount = DNS_HEADER_QDCOUNT(abuf);
  if (q.qdcount != a.qdcount)
    return 0;

  /* For each question in qbuf, find it in abuf. */
  q.p = qbuf + HFIXEDSZ;
  for (i = 0; i < q.qdcount; i++)
    {
      /* Decode the question in the query.  ares_expand_name allocates
       * q.name; every exit path below must free it. */
      if (ares_expand_name(q.p, qbuf, qlen, &q.name, &q.namelen)
          != ARES_SUCCESS)
        return 0;
      q.p += q.namelen;
      /* Ensure the fixed-size type/class fields are inside the packet. */
      if (q.p + QFIXEDSZ > qbuf + qlen)
        {
          free(q.name);
          return 0;
        }
      q.type = DNS_QUESTION_TYPE(q.p);
      q.dnsclass = DNS_QUESTION_CLASS(q.p);
      q.p += QFIXEDSZ;

      /* Search for this question in the answer. */
      a.p = abuf + HFIXEDSZ;
      for (j = 0; j < a.qdcount; j++)
        {
          /* Decode the question in the answer. */
          if (ares_expand_name(a.p, abuf, alen, &a.name, &a.namelen)
              != ARES_SUCCESS)
            {
              free(q.name);
              return 0;
            }
          a.p += a.namelen;
          if (a.p + QFIXEDSZ > abuf + alen)
            {
              free(q.name);
              free(a.name);
              return 0;
            }
          a.type = DNS_QUESTION_TYPE(a.p);
          a.dnsclass = DNS_QUESTION_CLASS(a.p);
          a.p += QFIXEDSZ;

          /* Compare the decoded questions.  DNS names are matched
           * case-insensitively per RFC 1035. */
          if (strcasecmp(q.name, a.name) == 0 && q.type == a.type
              && q.dnsclass == a.dnsclass)
            {
              free(a.name);
              break;
            }
          free(a.name);
        }

      free(q.name);
      /* Loop ran to completion: question i was not found in the reply. */
      if (j == a.qdcount)
        return 0;
    }
  return 1;
}
1178
same_address(struct sockaddr * sa,struct ares_addr * aa)1179 static int same_address(struct sockaddr *sa, struct ares_addr *aa)
1180 {
1181 void *addr1;
1182 void *addr2;
1183
1184 if (sa->sa_family == aa->family)
1185 {
1186 switch (aa->family)
1187 {
1188 case AF_INET:
1189 addr1 = &aa->addrV4;
1190 addr2 = &((struct sockaddr_in *)sa)->sin_addr;
1191 if (memcmp(addr1, addr2, sizeof(aa->addrV4)) == 0)
1192 return 1; /* match */
1193 break;
1194 case AF_INET6:
1195 addr1 = &aa->addrV6;
1196 addr2 = &((struct sockaddr_in6 *)sa)->sin6_addr;
1197 if (memcmp(addr1, addr2, sizeof(aa->addrV6)) == 0)
1198 return 1; /* match */
1199 break;
1200 default:
1201 break;
1202 }
1203 }
1204 return 0; /* different */
1205 }
1206
end_query(ares_channel channel,struct query * query,int status,unsigned char * abuf,int alen)1207 static void end_query (ares_channel channel, struct query *query, int status,
1208 unsigned char *abuf, int alen)
1209 {
1210 int i;
1211
1212 /* First we check to see if this query ended while one of our send
1213 * queues still has pointers to it.
1214 */
1215 for (i = 0; i < channel->nservers; i++)
1216 {
1217 struct server_state *server = &channel->servers[i];
1218 struct send_request *sendreq;
1219 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
1220 if (sendreq->owner_query == query)
1221 {
1222 sendreq->owner_query = NULL;
1223 assert(sendreq->data_storage == NULL);
1224 if (status == ARES_SUCCESS)
1225 {
1226 /* We got a reply for this query, but this queued
1227 * sendreq points into this soon-to-be-gone query's
1228 * tcpbuf. Probably this means we timed out and queued
1229 * the query for retransmission, then received a
1230 * response before actually retransmitting. This is
1231 * perfectly fine, so we want to keep the connection
1232 * running smoothly if we can. But in the worst case
1233 * we may have sent only some prefix of the query,
1234 * with some suffix of the query left to send. Also,
1235 * the buffer may be queued on multiple queues. To
1236 * prevent dangling pointers to the query's tcpbuf and
1237 * handle these cases, we just give such sendreqs
1238 * their own copy of the query packet.
1239 */
1240 sendreq->data_storage = malloc(sendreq->len);
1241 if (sendreq->data_storage != NULL)
1242 {
1243 memcpy(sendreq->data_storage, sendreq->data, sendreq->len);
1244 sendreq->data = sendreq->data_storage;
1245 }
1246 }
1247 if ((status != ARES_SUCCESS) || (sendreq->data_storage == NULL))
1248 {
1249 /* We encountered an error (probably a timeout,
1250 * suggesting the DNS server we're talking to is
1251 * probably unreachable, wedged, or severely
1252 * overloaded) or we couldn't copy the request, so
1253 * mark the connection as broken. When we get to
1254 * process_broken_connections() we'll close the
1255 * connection and try to re-send requests to another
1256 * server.
1257 */
1258 server->is_broken = 1;
1259 /* Just to be paranoid, zero out this sendreq... */
1260 sendreq->data = NULL;
1261 sendreq->len = 0;
1262 }
1263 }
1264 }
1265
1266 /* Invoke the callback */
1267 query->callback(query->arg, status, query->timeouts, abuf, alen);
1268 ares__free_query(query);
1269
1270 /* Simple cleanup policy: if no queries are remaining, close all
1271 * network sockets unless STAYOPEN is set.
1272 */
1273 if (!(channel->flags & ARES_FLAG_STAYOPEN) &&
1274 ares__is_list_empty(&(channel->all_queries)))
1275 {
1276 for (i = 0; i < channel->nservers; i++)
1277 ares__close_sockets(channel, &channel->servers[i]);
1278 }
1279 }
1280
ares__free_query(struct query * query)1281 void ares__free_query(struct query *query)
1282 {
1283 /* Remove the query from all the lists in which it is linked */
1284 ares__remove_from_list(&(query->queries_by_qid));
1285 ares__remove_from_list(&(query->queries_by_timeout));
1286 ares__remove_from_list(&(query->queries_to_server));
1287 ares__remove_from_list(&(query->all_queries));
1288 /* Zero out some important stuff, to help catch bugs */
1289 query->callback = NULL;
1290 query->arg = NULL;
1291 /* Deallocate the memory associated with the query */
1292 free(query->tcpbuf);
1293 free(query->server_info);
1294 free(query);
1295 }
1296