Coverage Report

Created: 2026-02-26 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/bind9/lib/isc/netmgr/socket.c
Line
Count
Source
1
/*
2
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3
 *
4
 * SPDX-License-Identifier: MPL-2.0
5
 *
6
 * This Source Code Form is subject to the terms of the Mozilla Public
7
 * License, v. 2.0. If a copy of the MPL was not distributed with this
8
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9
 *
10
 * See the COPYRIGHT file distributed with this work for additional
11
 * information regarding copyright ownership.
12
 */
13
14
#include <netinet/in.h>
15
16
#include <isc/errno.h>
17
#include <isc/result.h>
18
#include <isc/uv.h>
19
20
#include "netmgr-int.h"
21
22
/*
 * Convenience wrappers around setsockopt() for int-valued boolean socket
 * options: setsockopt_on() passes the value 1 and setsockopt_off() passes
 * the value 0.  Each macro evaluates to setsockopt()'s own return value.
 *
 * Arguments and the whole expansion are parenthesized so the macros are
 * safe to use with arbitrary argument expressions and inside larger
 * expressions.
 */
#define setsockopt_on(sockfd, level, optname) \
  (setsockopt((sockfd), (level), (optname), &(int){ 1 }, sizeof(int)))

#define setsockopt_off(sockfd, level, optname) \
  (setsockopt((sockfd), (level), (optname), &(int){ 0 }, sizeof(int)))
27
28
static isc_result_t
29
0
socket_freebind(uv_os_sock_t fd, sa_family_t sa_family) {
30
  /*
31
   * Set the IP_FREEBIND (or equivalent option) on the uv_handle.
32
   */
33
0
#ifdef IP_FREEBIND
34
0
  UNUSED(sa_family);
35
0
  if (setsockopt_on(fd, IPPROTO_IP, IP_FREEBIND) == -1) {
36
0
    return ISC_R_FAILURE;
37
0
  }
38
0
  return ISC_R_SUCCESS;
39
#elif defined(IP_BINDANY) || defined(IPV6_BINDANY)
40
  if (sa_family == AF_INET) {
41
#if defined(IP_BINDANY)
42
    if (setsockopt_on(fd, IPPROTO_IP, IP_BINDANY) == -1) {
43
      return ISC_R_FAILURE;
44
    }
45
    return ISC_R_SUCCESS;
46
#endif
47
  } else if (sa_family == AF_INET6) {
48
#if defined(IPV6_BINDANY)
49
    if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_BINDANY) == -1) {
50
      return ISC_R_FAILURE;
51
    }
52
    return ISC_R_SUCCESS;
53
#endif
54
  }
55
  return ISC_R_NOTIMPLEMENTED;
56
#elif defined(SO_BINDANY)
57
  UNUSED(sa_family);
58
  if (setsockopt_on(fd, SOL_SOCKET, SO_BINDANY) == -1) {
59
    return ISC_R_FAILURE;
60
  }
61
  return ISC_R_SUCCESS;
62
#else
63
  UNUSED(fd);
64
  UNUSED(sa_family);
65
  return ISC_R_NOTIMPLEMENTED;
66
#endif
67
0
}
68
69
int
70
isc__nm_udp_freebind(uv_udp_t *handle, const struct sockaddr *addr,
71
0
         unsigned int flags) {
72
0
  int r;
73
0
  uv_os_sock_t fd = -1;
74
75
0
  r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
76
0
  if (r < 0) {
77
0
    return r;
78
0
  }
79
80
0
  r = uv_udp_bind(handle, addr, flags);
81
0
  if (r == UV_EADDRNOTAVAIL &&
82
0
      socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS)
83
0
  {
84
    /*
85
     * Retry binding with IP_FREEBIND (or equivalent option) if the
86
     * address is not available. This helps with IPv6 tentative
87
     * addresses which are reported by the route socket, although
88
     * named is not yet able to properly bind to them.
89
     */
90
0
    r = uv_udp_bind(handle, addr, flags);
91
0
  }
92
93
0
  return r;
94
0
}
95
96
static int
97
tcp_bind_now(uv_tcp_t *handle, const struct sockaddr *addr,
98
0
       unsigned int flags) {
99
0
  int r;
100
0
  struct sockaddr_storage sname;
101
0
  int snamelen = sizeof(sname);
102
103
0
  r = uv_tcp_bind(handle, addr, flags);
104
0
  if (r < 0) {
105
0
    return r;
106
0
  }
107
108
  /*
109
   * uv_tcp_bind() uses a delayed error, initially returning
110
   * success even if bind() fails. By calling uv_tcp_getsockname()
111
   * here we can find out whether the bind() call was successful.
112
   */
113
0
  r = uv_tcp_getsockname(handle, (struct sockaddr *)&sname, &snamelen);
114
0
  if (r < 0) {
115
0
    return r;
116
0
  }
117
118
0
  return 0;
119
0
}
120
121
int
122
isc__nm_tcp_freebind(uv_tcp_t *handle, const struct sockaddr *addr,
123
0
         unsigned int flags) {
124
0
  int r;
125
0
  uv_os_sock_t fd = -1;
126
127
0
  r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
128
0
  if (r < 0) {
129
0
    return r;
130
0
  }
131
132
0
  r = tcp_bind_now(handle, addr, flags);
133
0
  if (r == UV_EADDRNOTAVAIL &&
134
0
      socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS)
135
0
  {
136
    /*
137
     * Retry binding with IP_FREEBIND (or equivalent option) if the
138
     * address is not available. This helps with IPv6 tentative
139
     * addresses which are reported by the route socket, although
140
     * named is not yet able to properly bind to them.
141
     */
142
0
    r = tcp_bind_now(handle, addr, flags);
143
0
  }
144
145
0
  return r;
146
0
}
147
148
isc_result_t
149
0
isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp) {
150
0
  int sock = socket(domain, type, protocol);
151
0
  if (sock < 0) {
152
0
    return isc_errno_toresult(errno);
153
0
  }
154
155
0
  *sockp = (uv_os_sock_t)sock;
156
0
  return ISC_R_SUCCESS;
157
0
}
158
159
void
160
0
isc__nm_closesocket(uv_os_sock_t sock) {
161
0
  close(sock);
162
0
}
163
164
isc_result_t
165
0
isc__nm_socket_reuse(uv_os_sock_t fd, int val) {
166
  /*
167
   * Generally, the SO_REUSEADDR socket option allows reuse of
168
   * local addresses.
169
   *
170
   * On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some
171
   * additional refinements for programs that use multicast.
172
   *
173
   * On Linux, SO_REUSEPORT has different semantics: it _shares_ the port
174
   * rather than steal it from the current listener, so we don't use it
175
   * here, but rather in isc__nm_socket_reuse_lb().
176
   */
177
178
#if defined(SO_REUSEPORT) && !defined(__linux__)
179
  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)) == -1) {
180
    return ISC_R_FAILURE;
181
  }
182
  return ISC_R_SUCCESS;
183
#elif defined(SO_REUSEADDR)
184
0
  if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) == -1) {
185
0
    return ISC_R_FAILURE;
186
0
  }
187
0
  return ISC_R_SUCCESS;
188
#else
189
  UNUSED(fd);
190
  return ISC_R_NOTIMPLEMENTED;
191
#endif
192
0
}
193
194
isc_result_t
195
0
isc__nm_socket_reuse_lb(uv_os_sock_t fd) {
196
  /*
197
   * On FreeBSD 12+, SO_REUSEPORT_LB socket option allows sockets to be
198
   * bound to an identical socket address. For UDP sockets, the use of
199
   * this option can provide better distribution of incoming datagrams to
200
   * multiple processes (or threads) as compared to the traditional
201
   * technique of having multiple processes compete to receive datagrams
202
   * on the same socket.
203
   *
204
   * On Linux, the same thing is achieved simply with SO_REUSEPORT.
205
   */
206
#if defined(SO_REUSEPORT_LB)
207
  if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT_LB) == -1) {
208
    return ISC_R_FAILURE;
209
  } else {
210
    return ISC_R_SUCCESS;
211
  }
212
#elif defined(SO_REUSEPORT) && defined(__linux__)
213
0
  if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) {
214
0
    return ISC_R_FAILURE;
215
0
  } else {
216
0
    return ISC_R_SUCCESS;
217
0
  }
218
#else
219
  UNUSED(fd);
220
  return ISC_R_NOTIMPLEMENTED;
221
#endif
222
0
}
223
224
isc_result_t
225
0
isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family) {
226
  /*
227
   * Disable the Path MTU Discovery on IP packets
228
   */
229
0
  if (sa_family == AF_INET6) {
230
0
#if defined(IPV6_DONTFRAG)
231
0
    if (setsockopt_off(fd, IPPROTO_IPV6, IPV6_DONTFRAG) == -1) {
232
0
      return ISC_R_FAILURE;
233
0
    } else {
234
0
      return ISC_R_SUCCESS;
235
0
    }
236
#elif defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
237
    if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
238
             &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1)
239
    {
240
      return ISC_R_FAILURE;
241
    } else {
242
      return ISC_R_SUCCESS;
243
    }
244
#else
245
    UNUSED(fd);
246
#endif
247
0
  } else if (sa_family == AF_INET) {
248
#if defined(IP_DONTFRAG)
249
    if (setsockopt_off(fd, IPPROTO_IP, IP_DONTFRAG) == -1) {
250
      return ISC_R_FAILURE;
251
    } else {
252
      return ISC_R_SUCCESS;
253
    }
254
#elif defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
255
0
    if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
256
0
             &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1)
257
0
    {
258
0
      return ISC_R_FAILURE;
259
0
    } else {
260
0
      return ISC_R_SUCCESS;
261
0
    }
262
#else
263
    UNUSED(fd);
264
#endif
265
0
  } else {
266
0
    return ISC_R_FAMILYNOSUPPORT;
267
0
  }
268
269
0
  return ISC_R_NOTIMPLEMENTED;
270
0
}
271
272
isc_result_t
273
0
isc__nm_socket_v6only(uv_os_sock_t fd, sa_family_t sa_family) {
274
  /*
275
   * Enable the IPv6-only option on IPv6 sockets
276
   */
277
0
  if (sa_family == AF_INET6) {
278
0
#if defined(IPV6_V6ONLY)
279
0
    if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_V6ONLY) == -1) {
280
0
      return ISC_R_FAILURE;
281
0
    } else {
282
0
      return ISC_R_SUCCESS;
283
0
    }
284
#else
285
    UNUSED(fd);
286
#endif
287
0
  }
288
0
  return ISC_R_NOTIMPLEMENTED;
289
0
}
290
291
isc_result_t
292
0
isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms) {
293
#if defined(TIMEOUT_OPTNAME)
294
  TIMEOUT_TYPE timeout = timeout_ms / TIMEOUT_DIV;
295
296
  if (timeout == 0) {
297
    timeout = 1;
298
  }
299
300
  if (setsockopt(fd, IPPROTO_TCP, TIMEOUT_OPTNAME, &timeout,
301
           sizeof(timeout)) == -1)
302
  {
303
    return ISC_R_FAILURE;
304
  }
305
306
  return ISC_R_SUCCESS;
307
#else
308
0
  UNUSED(fd);
309
0
  UNUSED(timeout_ms);
310
311
0
  return ISC_R_SUCCESS;
312
0
#endif
313
0
}
314
315
isc_result_t
316
0
isc__nm_socket_tcp_nodelay(uv_os_sock_t fd, bool value) {
317
0
#ifdef TCP_NODELAY
318
0
  int ret;
319
320
0
  if (value) {
321
0
    ret = setsockopt_on(fd, IPPROTO_TCP, TCP_NODELAY);
322
0
  } else {
323
0
    ret = setsockopt_off(fd, IPPROTO_TCP, TCP_NODELAY);
324
0
  }
325
326
0
  if (ret == -1) {
327
0
    return ISC_R_FAILURE;
328
0
  } else {
329
0
    return ISC_R_SUCCESS;
330
0
  }
331
#else
332
  UNUSED(fd);
333
  return ISC_R_SUCCESS;
334
#endif
335
0
}
336
337
isc_result_t
338
0
isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size) {
339
0
#ifdef TCP_MAXSEG
340
0
  if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *)&size,
341
0
           sizeof(size)))
342
0
  {
343
0
    return ISC_R_FAILURE;
344
0
  } else {
345
0
    return ISC_R_SUCCESS;
346
0
  }
347
#else
348
  UNUSED(fd);
349
  UNUSED(size);
350
  return ISC_R_SUCCESS;
351
#endif
352
0
}
353
354
isc_result_t
355
0
isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
356
0
  if (sa_family != AF_INET6) {
357
0
    return ISC_R_SUCCESS;
358
0
  }
359
#ifdef IPV6_USE_MIN_MTU
360
  if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU) == -1) {
361
    return ISC_R_FAILURE;
362
  }
363
#elif defined(IPV6_MTU)
364
0
  if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &(int){ 1280 },
365
0
           sizeof(int)) == -1)
366
0
  {
367
0
    return ISC_R_FAILURE;
368
0
  }
369
#else
370
  UNUSED(fd);
371
#endif
372
373
0
  return ISC_R_SUCCESS;
374
0
}
375
376
/*
377
 * See
378
 * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
379
 * for rationale.
380
 */
381
0
#define PORT_RANGE 1000
382
383
isc_result_t
384
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
385
0
            sa_family_t af ISC_ATTR_UNUSED) {
386
0
#ifdef IP_BIND_ADDRESS_NO_PORT
387
0
  if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
388
0
    return ISC_R_FAILURE;
389
0
  }
390
0
#endif
391
392
0
#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
393
  /*
394
   * The option takes an uint32_t value with the high 16 bits
395
   * set to the upper range bound, and the low 16 bits set to
396
   * the lower range bound.  Range bounds are inclusive.  The
397
   * 16-bit values should be in host byte order.
398
   */
399
0
  uint32_t port_range;
400
0
  int major, minor;
401
0
  isc_os_kernel(NULL, &major, &minor, NULL);
402
403
0
  in_port_t port_low, port_high;
404
0
  switch (af) {
405
0
  case AF_INET:
406
0
    port_low = isc__netmgr->port_low4;
407
0
    port_high = isc__netmgr->port_high4;
408
0
    break;
409
0
  case AF_INET6:
410
0
    port_low = isc__netmgr->port_low6;
411
0
    port_high = isc__netmgr->port_high6;
412
0
    break;
413
0
  default:
414
0
    INSIST(0);
415
0
  }
416
417
  /*
418
   * Linux 6.8 implemented a following patch:
419
   *
420
   * If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
421
   * port selection no longer favors even ports.
422
   *
423
   * This means that connect() can find a suitable source port
424
   * faster, and applications can use a different split between
425
   * connect() and bind() users.
426
   */
427
0
  if (major < 6 || (major == 6 && minor < 8)) {
428
    /*
429
     * On Linux << 6.8, use IP_LOCAL_PORT_RANGE to
430
     * partition ephemeral port range randomly to help
431
     * with the port selection.
432
     */
433
0
    if (port_high - port_low <= PORT_RANGE) {
434
0
      return ISC_R_RANGE;
435
0
    }
436
437
    /*
438
     * port_low <= N < port_high - PORT_RANGE
439
     */
440
0
    port_high -= PORT_RANGE;
441
0
    port_low += isc_random_uniform(port_high - port_low);
442
0
    port_high = port_low + PORT_RANGE;
443
0
  }
444
0
  INSIST(port_low > 0);
445
0
  INSIST(port_low < port_high);
446
447
0
  port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);
448
0
  if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
449
0
           sizeof(port_range)) == -1)
450
0
  {
451
0
    return ISC_R_FAILURE;
452
0
  }
453
0
#endif
454
0
  return ISC_R_SUCCESS;
455
0
}