Coverage Report

Created: 2025-11-24 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-ares/src/lib/ares_metrics.c
Line
Count
Source
1
/* MIT License
2
 *
3
 * Copyright (c) 2024 Brad House
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a copy
6
 * of this software and associated documentation files (the "Software"), to deal
7
 * in the Software without restriction, including without limitation the rights
8
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
 * copies of the Software, and to permit persons to whom the Software is
10
 * furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 *
24
 * SPDX-License-Identifier: MIT
25
 */
26
27
28
/* IMPLEMENTATION NOTES
29
 * ====================
30
 *
31
 * With very little effort we should be able to determine fairly proper timeouts
32
 * we can use based on prior query history.  We track in order to be able to
33
 * auto-scale when network conditions change (e.g. maybe there is a provider
34
 * failover and timings change due to that).  Apple appears to do this within
35
 * their system resolver in MacOS.  Obviously we should have a minimum, maximum,
36
 * and initial value to make sure the algorithm doesn't somehow go off the
37
 * rails.
38
 *
39
 * Values:
40
 * - Minimum Timeout: 250ms (approximate RTT half-way around the globe)
41
 * - Maximum Timeout: 5000ms (Recommended timeout in RFC 1123), can be reduced
42
 *   by ARES_OPT_MAXTIMEOUTMS, but otherwise the bound specified by the option
43
 *   caps the retry timeout.
44
 * - Initial Timeout: User-specified via configuration or ARES_OPT_TIMEOUTMS
45
 * - Average latency multiplier: 5x (a local DNS server returning a cached value
46
 *   will be quicker than if it needs to recurse so we need to account for this)
47
 * - Minimum Count for Average: 3.  This is the minimum number of queries we
48
 *   need to form an average for the bucket.
49
 *
50
 * Per-server buckets for tracking latency over time (these are ephemeral
51
 * meaning they don't persist once a channel is destroyed).  We record both the
52
 * current timespan for the bucket and the immediately preceding timespan so
53
 * that on roll-over we can still maintain recent metrics for calculations:
54
 * - 1 minute
55
 * - 15 minutes
56
 * - 1 hr
57
 * - 1 day
58
 * - since inception
59
 *
60
 * Each bucket would contain:
61
 * - timestamp (divided by interval)
62
 * - minimum latency
63
 * - maximum latency
64
 * - total time
65
 * - count
66
 * NOTE: average latency is (total time / count), we will calculate this
67
 *       dynamically when needed
68
 *
69
 * Basic algorithm for calculating timeout to use would be:
70
 * - Scan from most recent bucket to least recent
71
 * - Check timestamp of bucket, if it doesn't match current time, check the
72
 *   previous bucket
73
 * - Check count of bucket, if it's not at least the "Minimum Count for Average",
74
 *   check the previous bucket, if that fails too continue to next bucket
75
 * - If we reached the end with no bucket match, use "Initial Timeout"
76
 * - If bucket is selected, take ("total time" / count) as Average latency,
77
 *   multiply by "Average Latency Multiplier", bound by "Minimum Timeout" and
78
 *   "Maximum Timeout"
79
 * NOTE: The timeout calculated may not be the timeout used.  If we are retrying
80
 * the query on the same server another time, then it will use a larger value
81
 *
82
 * On each query reply where the response is legitimate (proper response or
83
 * NXDOMAIN) and not something like a server error:
84
 * - Cycle through each bucket in order
85
 * - Check timestamp of bucket against current timestamp, if out of date
86
 *   overwrite previous entry with values, clear current values
87
 * - Compare current minimum and maximum recorded latency against query time and
88
 *   adjust if necessary
89
 * - Increment "count" by 1 and "total time" by the query time
90
 *
91
 * Other Notes:
92
 * - This is always-on, the only user-configurable value is the initial
93
 *   timeout which simply re-uses the current option.
94
 * - Minimum and Maximum latencies for a bucket are currently unused but are
95
 *   there in case we find a need for them in the future.
96
 */
97
98
#include "ares_private.h"
99
100
/*! Minimum timeout value in milliseconds. Chosen due to it being approximately
101
 *  the RTT half-way around the world */
102
0
#define MIN_TIMEOUT_MS 250
103
104
/*! Multiplier applied to a bucket's average latency to derive the timeout */
105
0
#define AVG_TIMEOUT_MULTIPLIER 5
106
107
/*! Upper timeout bound in milliseconds, only used if channel->maxtimeout is 0 */
108
0
#define MAX_TIMEOUT_MS 5000
109
110
/*! Minimum number of recorded queries a bucket needs to form an average */
111
0
#define MIN_COUNT_FOR_AVERAGE 3
112
113
static time_t ares_metric_timestamp(ares_server_bucket_t  bucket,
114
                                    const ares_timeval_t *now,
115
                                    ares_bool_t           is_previous)
116
0
{
117
0
  time_t divisor = 1; /* Silence bogus MSVC warning by setting default value */
118
119
0
  switch (bucket) {
120
0
    case ARES_METRIC_1MINUTE:
121
0
      divisor = 60;
122
0
      break;
123
0
    case ARES_METRIC_15MINUTES:
124
0
      divisor = 15 * 60;
125
0
      break;
126
0
    case ARES_METRIC_1HOUR:
127
0
      divisor = 60 * 60;
128
0
      break;
129
0
    case ARES_METRIC_1DAY:
130
0
      divisor = 24 * 60 * 60;
131
0
      break;
132
0
    case ARES_METRIC_INCEPTION:
133
0
      return is_previous ? 0 : 1;
134
0
    case ARES_METRIC_COUNT:
135
0
      return 0; /* Invalid! */
136
0
  }
137
138
0
  if (is_previous) {
139
0
    if (divisor >= now->sec) {
140
0
      return 0;
141
0
    }
142
0
    return (time_t)((now->sec - divisor) / divisor);
143
0
  }
144
145
0
  return (time_t)(now->sec / divisor);
146
0
}
147
148
void ares_metrics_record(const ares_query_t *query, ares_server_t *server,
149
                         ares_status_t status, const ares_dns_record_t *dnsrec)
150
0
{
151
0
  ares_timeval_t       now;
152
0
  ares_timeval_t       tvdiff;
153
0
  unsigned int         query_ms;
154
0
  ares_dns_rcode_t     rcode;
155
0
  ares_server_bucket_t i;
156
157
0
  if (status != ARES_SUCCESS) {
158
0
    return;
159
0
  }
160
161
0
  if (server == NULL) {
162
0
    return;
163
0
  }
164
165
0
  ares_tvnow(&now);
166
167
0
  rcode = ares_dns_record_get_rcode(dnsrec);
168
0
  if (rcode != ARES_RCODE_NOERROR && rcode != ARES_RCODE_NXDOMAIN) {
169
0
    return;
170
0
  }
171
172
0
  ares_timeval_diff(&tvdiff, &query->ts, &now);
173
0
  query_ms = (unsigned int)((tvdiff.sec * 1000) + (tvdiff.usec / 1000));
174
0
  if (query_ms == 0) {
175
0
    query_ms = 1;
176
0
  }
177
178
  /* Place in each bucket */
179
0
  for (i = 0; i < ARES_METRIC_COUNT; i++) {
180
0
    time_t ts = ares_metric_timestamp(i, &now, ARES_FALSE);
181
182
    /* Copy metrics to prev and clear */
183
0
    if (ts != server->metrics[i].ts) {
184
0
      server->metrics[i].prev_ts          = server->metrics[i].ts;
185
0
      server->metrics[i].prev_total_ms    = server->metrics[i].total_ms;
186
0
      server->metrics[i].prev_total_count = server->metrics[i].total_count;
187
0
      server->metrics[i].ts               = ts;
188
0
      server->metrics[i].latency_min_ms   = 0;
189
0
      server->metrics[i].latency_max_ms   = 0;
190
0
      server->metrics[i].total_ms         = 0;
191
0
      server->metrics[i].total_count      = 0;
192
0
    }
193
194
0
    if (server->metrics[i].latency_min_ms == 0 ||
195
0
        server->metrics[i].latency_min_ms > query_ms) {
196
0
      server->metrics[i].latency_min_ms = query_ms;
197
0
    }
198
199
0
    if (query_ms > server->metrics[i].latency_max_ms) {
200
0
      server->metrics[i].latency_max_ms = query_ms;
201
0
    }
202
203
0
    server->metrics[i].total_count++;
204
0
    server->metrics[i].total_ms += (ares_uint64_t)query_ms;
205
0
  }
206
0
}
207
208
size_t ares_metrics_server_timeout(const ares_server_t  *server,
209
                                   const ares_timeval_t *now)
210
0
{
211
0
  const ares_channel_t *channel = server->channel;
212
0
  ares_server_bucket_t  i;
213
0
  size_t                timeout_ms = 0;
214
0
  size_t                max_timeout_ms;
215
216
0
  for (i = 0; i < ARES_METRIC_COUNT; i++) {
217
0
    time_t ts = ares_metric_timestamp(i, now, ARES_FALSE);
218
219
    /* This ts has been invalidated, see if we should use the previous
220
     * time period */
221
0
    if (ts != server->metrics[i].ts ||
222
0
        server->metrics[i].total_count < MIN_COUNT_FOR_AVERAGE) {
223
0
      time_t prev_ts = ares_metric_timestamp(i, now, ARES_TRUE);
224
0
      if (prev_ts != server->metrics[i].prev_ts ||
225
0
          server->metrics[i].prev_total_count < MIN_COUNT_FOR_AVERAGE) {
226
        /* Move onto next bucket */
227
0
        continue;
228
0
      }
229
      /* Calculate average time for previous bucket */
230
0
      timeout_ms = (size_t)(server->metrics[i].prev_total_ms /
231
0
                            server->metrics[i].prev_total_count);
232
0
    } else {
233
      /* Calculate average time for current bucket*/
234
0
      timeout_ms =
235
0
        (size_t)(server->metrics[i].total_ms / server->metrics[i].total_count);
236
0
    }
237
238
    /* Multiply average by constant to get timeout value */
239
0
    timeout_ms *= AVG_TIMEOUT_MULTIPLIER;
240
0
    break;
241
0
  }
242
243
  /* If we're here, that means its the first query for the server, so we just
244
   * use the initial default timeout */
245
0
  if (timeout_ms == 0) {
246
0
    timeout_ms = channel->timeout;
247
0
  }
248
249
  /* don't go below lower bounds */
250
0
  if (timeout_ms < MIN_TIMEOUT_MS) {
251
0
    timeout_ms = MIN_TIMEOUT_MS;
252
0
  }
253
254
  /* don't go above upper bounds */
255
0
  max_timeout_ms = channel->maxtimeout ? channel->maxtimeout : MAX_TIMEOUT_MS;
256
0
  if (timeout_ms > max_timeout_ms) {
257
0
    timeout_ms = max_timeout_ms;
258
0
  }
259
260
0
  return timeout_ms;
261
0
}