• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #define LOG_TAG "res_stats"
17 
18 #include <arpa/nameser.h>
19 #include <stdbool.h>
20 #include <string.h>
21 
22 #include <android-base/logging.h>
23 
24 #include "netd_resolv/stats.h"
25 
26 
// Calculate the round-trip time, in milliseconds, from start time t0 to end time t1.
//
// Each timestamp's nanosecond field is truncated to whole milliseconds before subtracting, so
// the result can differ by one millisecond from the exact elapsed time. The result is negative
// when t1 is earlier than t0.
//
// The seconds are subtracted before being scaled to milliseconds, in a 64-bit type, so that
// large absolute timestamps (e.g. seconds since the epoch) cannot overflow where long is 32 bits.
int _res_stats_calculate_rtt(const struct timespec* t1, const struct timespec* t0) {
    // Whole-second part of the difference, scaled to milliseconds.
    const long long sec_part_ms = ((long long) t1->tv_sec - (long long) t0->tv_sec) * 1000LL;
    // Sub-second part: divide ns by one million to get ms, separately for each timestamp.
    const long long nsec_part_ms =
            (long long) (t1->tv_nsec / 1000000) - (long long) (t0->tv_nsec / 1000000);
    return (int) (sec_part_ms + nsec_part_ms);
}
34 
35 // Create a sample for calculating server reachability statistics.
_res_stats_set_sample(res_sample * sample,time_t now,int rcode,int rtt)36 void _res_stats_set_sample(res_sample* sample, time_t now, int rcode, int rtt) {
37     LOG(INFO) << __func__ << ": rcode = " << rcode << ", sec = " << rtt;
38     sample->at = now;
39     sample->rcode = rcode;
40     sample->rtt = rtt;
41 }
42 
43 /* Clears all stored samples for the given server. */
_res_stats_clear_samples(res_stats * stats)44 void _res_stats_clear_samples(res_stats* stats) {
45     stats->sample_count = stats->sample_next = 0;
46 }
47 
48 /* Aggregates the reachability statistics for the given server based on on the stored samples. */
android_net_res_stats_aggregate(res_stats * stats,int * successes,int * errors,int * timeouts,int * internal_errors,int * rtt_avg,time_t * last_sample_time)49 void android_net_res_stats_aggregate(res_stats* stats, int* successes, int* errors, int* timeouts,
50                                      int* internal_errors, int* rtt_avg, time_t* last_sample_time) {
51     int s = 0;   // successes
52     int e = 0;   // errors
53     int t = 0;   // timouts
54     int ie = 0;  // internal errors
55     long rtt_sum = 0;
56     time_t last = 0;
57     int rtt_count = 0;
58     for (int i = 0; i < stats->sample_count; ++i) {
59         // Treat everything as an error that the code in send_dg() already considers a
60         // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN
61         // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section
62         // is not treated as an error here either. FORMERR seems to sometimes be returned by
63         // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such responses
64         // as an indication of a broken server is unclear, though. For now treat such responses,
65         // as well as unknown codes as errors.
66         switch (stats->samples[i].rcode) {
67             case NOERROR:
68             case NOTAUTH:
69             case NXDOMAIN:
70                 ++s;
71                 rtt_sum += stats->samples[i].rtt;
72                 ++rtt_count;
73                 break;
74             case RCODE_TIMEOUT:
75                 ++t;
76                 break;
77             case RCODE_INTERNAL_ERROR:
78                 ++ie;
79                 break;
80             case SERVFAIL:
81             case NOTIMP:
82             case REFUSED:
83             default:
84                 ++e;
85                 break;
86         }
87     }
88     *successes = s;
89     *errors = e;
90     *timeouts = t;
91     *internal_errors = ie;
92     /* If there was at least one successful sample, calculate average RTT. */
93     if (rtt_count) {
94         *rtt_avg = rtt_sum / rtt_count;
95     } else {
96         *rtt_avg = -1;
97     }
98     /* If we had at least one sample, populate last sample time. */
99     if (stats->sample_count > 0) {
100         if (stats->sample_next > 0) {
101             last = stats->samples[stats->sample_next - 1].at;
102         } else {
103             last = stats->samples[stats->sample_count - 1].at;
104         }
105     }
106     *last_sample_time = last;
107 }
108 
109 // Returns true if the server is considered usable, i.e. if the success rate is not lower than the
110 // threshold for the stored stored samples. If not enough samples are stored, the server is
111 // considered usable.
res_stats_usable_server(const res_params * params,res_stats * stats)112 static bool res_stats_usable_server(const res_params* params, res_stats* stats) {
113     int successes = -1;
114     int errors = -1;
115     int timeouts = -1;
116     int internal_errors = -1;
117     int rtt_avg = -1;
118     time_t last_sample_time = 0;
119     android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors,
120                                     &rtt_avg, &last_sample_time);
121     if (successes >= 0 && errors >= 0 && timeouts >= 0) {
122         int total = successes + errors + timeouts;
123         LOG(INFO) << __func__ << ": NS stats: S " << successes << " + E " << errors << " + T "
124                   << timeouts << " + I " << internal_errors << " = " << total
125                   << ", rtt = " << rtt_avg << ", min_samples = " << unsigned(params->min_samples);
126         if (total >= params->min_samples && (errors > 0 || timeouts > 0)) {
127             int success_rate = successes * 100 / total;
128             LOG(INFO) << __func__ << ": success rate " << success_rate;
129             if (success_rate < params->success_threshold) {
130                 time_t now = time(NULL);
131                 if (now - last_sample_time > params->sample_validity) {
132                     // Note: It might be worth considering to expire old servers after their expiry
133                     // date has been reached, however the code for returning the ring buffer to its
134                     // previous non-circular state would induce additional complexity.
135                     LOG(INFO) << __func__ << ": samples stale, retrying server";
136                     _res_stats_clear_samples(stats);
137                 } else {
138                     LOG(INFO) << __func__ << ": too many resolution errors, ignoring server";
139                     return 0;
140                 }
141             }
142         }
143     }
144     return 1;
145 }
146 
android_net_res_stats_get_usable_servers(const res_params * params,res_stats stats[],int nscount,bool usable_servers[])147 int android_net_res_stats_get_usable_servers(const res_params* params, res_stats stats[],
148                                              int nscount, bool usable_servers[]) {
149     unsigned usable_servers_found = 0;
150     for (int ns = 0; ns < nscount; ns++) {
151         bool usable = res_stats_usable_server(params, &stats[ns]);
152         if (usable) {
153             ++usable_servers_found;
154         }
155         usable_servers[ns] = usable;
156     }
157     // If there are no usable servers, consider all of them usable.
158     // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
159     if (usable_servers_found == 0) {
160         for (int ns = 0; ns < nscount; ns++) {
161             usable_servers[ns] = true;
162         }
163     }
164     return (usable_servers_found == 0) ? nscount : usable_servers_found;
165 }
166