Coverage Report

Created: 2026-06-15 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/CMake/Utilities/cmlibuv/src/unix/linux.c
Line
Count
Source
1
/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2
 * Permission is hereby granted, free of charge, to any person obtaining a copy
3
 * of this software and associated documentation files (the "Software"), to
4
 * deal in the Software without restriction, including without limitation the
5
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6
 * sell copies of the Software, and to permit persons to whom the Software is
7
 * furnished to do so, subject to the following conditions:
8
 *
9
 * The above copyright notice and this permission notice shall be included in
10
 * all copies or substantial portions of the Software.
11
 *
12
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18
 * IN THE SOFTWARE.
19
 */
20
21
/* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22
 * EPOLL* counterparts.  We use the POLL* variants in this file because that
23
 * is what libuv uses elsewhere.
24
 */
25
26
#include "uv.h"
27
#include "internal.h"
28
29
#include <inttypes.h>
30
#include <stdatomic.h>
31
#include <stddef.h>  /* offsetof */
32
#include <stdint.h>
33
#include <stdio.h>
34
#include <stdlib.h>
35
#include <string.h>
36
#include <assert.h>
37
#include <errno.h>
38
39
#include <fcntl.h>
40
#include <ifaddrs.h>
41
#include <net/ethernet.h>
42
#include <net/if.h>
43
#include <netpacket/packet.h>
44
#include <sys/epoll.h>
45
#include <sys/inotify.h>
46
#include <sys/mman.h>
47
#include <sys/param.h>
48
#include <sys/prctl.h>
49
#include <sys/socket.h>
50
#include <sys/stat.h>
51
#include <sys/syscall.h>
52
#include <sys/sysinfo.h>
53
#include <sys/sysmacros.h>
54
#include <sys/types.h>
55
#include <sys/utsname.h>
56
#include <time.h>
57
#include <unistd.h>
58
59
#ifndef __NR_io_uring_setup
60
# define __NR_io_uring_setup 425
61
#endif
62
63
#ifndef __NR_io_uring_enter
64
# define __NR_io_uring_enter 426
65
#endif
66
67
#ifndef __NR_io_uring_register
68
# define __NR_io_uring_register 427
69
#endif
70
71
#ifndef __NR_copy_file_range
72
# if defined(__x86_64__)
73
#  define __NR_copy_file_range 326
74
# elif defined(__i386__)
75
#  define __NR_copy_file_range 377
76
# elif defined(__s390__)
77
#  define __NR_copy_file_range 375
78
# elif defined(__arm__)
79
#  define __NR_copy_file_range 391
80
# elif defined(__aarch64__)
81
#  define __NR_copy_file_range 285
82
# elif defined(__powerpc__)
83
#  define __NR_copy_file_range 379
84
# elif defined(__arc__)
85
#  define __NR_copy_file_range 285
86
# elif defined(__riscv)
87
#  define __NR_copy_file_range 285
88
# endif
89
#endif /* __NR_copy_file_range */
90
91
#ifndef __NR_statx
92
# if defined(__x86_64__)
93
#  define __NR_statx 332
94
# elif defined(__i386__)
95
#  define __NR_statx 383
96
# elif defined(__aarch64__)
97
#  define __NR_statx 397
98
# elif defined(__arm__)
99
#  define __NR_statx 397
100
# elif defined(__ppc__)
101
#  define __NR_statx 383
102
# elif defined(__s390__)
103
#  define __NR_statx 379
104
# elif defined(__riscv)
105
#  define __NR_statx 291
106
# endif
107
#endif /* __NR_statx */
108
109
#ifndef __NR_getrandom
110
# if defined(__x86_64__)
111
#  define __NR_getrandom 318
112
# elif defined(__i386__)
113
#  define __NR_getrandom 355
114
# elif defined(__aarch64__)
115
#  define __NR_getrandom 384
116
# elif defined(__arm__)
117
#  define __NR_getrandom 384
118
# elif defined(__ppc__)
119
#  define __NR_getrandom 359
120
# elif defined(__s390__)
121
#  define __NR_getrandom 349
122
# elif defined(__riscv)
123
#  define __NR_getrandom 278
124
# endif
125
#endif /* __NR_getrandom */
126
127
enum {
128
  UV__IORING_SETUP_SQPOLL = 2u,
129
  UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
130
};
131
132
enum {
133
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
134
  UV__IORING_FEAT_NODROP = 2u,
135
  UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
136
};
137
138
enum {
139
  UV__IORING_OP_READV = 1,
140
  UV__IORING_OP_WRITEV = 2,
141
  UV__IORING_OP_FSYNC = 3,
142
  UV__IORING_OP_OPENAT = 18,
143
  UV__IORING_OP_CLOSE = 19,
144
  UV__IORING_OP_STATX = 21,
145
  UV__IORING_OP_EPOLL_CTL = 29,
146
  UV__IORING_OP_RENAMEAT = 35,
147
  UV__IORING_OP_UNLINKAT = 36,
148
  UV__IORING_OP_MKDIRAT = 37,
149
  UV__IORING_OP_SYMLINKAT = 38,
150
  UV__IORING_OP_LINKAT = 39,
151
  UV__IORING_OP_FTRUNCATE = 55,
152
};
153
154
enum {
155
  UV__IORING_ENTER_GETEVENTS = 1u,
156
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
157
};
158
159
enum {
160
  UV__IORING_SQ_NEED_WAKEUP = 1u,
161
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
162
};
163
164
struct uv__io_cqring_offsets {
165
  uint32_t head;
166
  uint32_t tail;
167
  uint32_t ring_mask;
168
  uint32_t ring_entries;
169
  uint32_t overflow;
170
  uint32_t cqes;
171
  uint64_t reserved0;
172
  uint64_t reserved1;
173
};
174
175
STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
176
177
struct uv__io_sqring_offsets {
178
  uint32_t head;
179
  uint32_t tail;
180
  uint32_t ring_mask;
181
  uint32_t ring_entries;
182
  uint32_t flags;
183
  uint32_t dropped;
184
  uint32_t array;
185
  uint32_t reserved0;
186
  uint64_t reserved1;
187
};
188
189
STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
190
191
struct uv__io_uring_cqe {
192
  uint64_t user_data;
193
  int32_t res;
194
  uint32_t flags;
195
};
196
197
STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
198
199
struct uv__io_uring_sqe {
200
  uint8_t opcode;
201
  uint8_t flags;
202
  uint16_t ioprio;
203
  int32_t fd;
204
  union {
205
    uint64_t off;
206
    uint64_t addr2;
207
  };
208
  union {
209
    uint64_t addr;
210
  };
211
  uint32_t len;
212
  union {
213
    uint32_t rw_flags;
214
    uint32_t fsync_flags;
215
    uint32_t open_flags;
216
    uint32_t statx_flags;
217
  };
218
  uint64_t user_data;
219
  union {
220
    uint16_t buf_index;
221
    uint64_t pad[3];
222
  };
223
};
224
225
STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
226
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
227
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
228
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
229
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
230
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
231
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
232
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
233
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
234
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
235
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
236
237
struct uv__io_uring_params {
238
  uint32_t sq_entries;
239
  uint32_t cq_entries;
240
  uint32_t flags;
241
  uint32_t sq_thread_cpu;
242
  uint32_t sq_thread_idle;
243
  uint32_t features;
244
  uint32_t reserved[4];
245
  struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
246
  struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
247
};
248
249
STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
250
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
251
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
252
253
STATIC_ASSERT(EPOLL_CTL_ADD < 4);
254
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
255
STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256
257
/* One node of the red-black tree generated by
 * RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers).
 */
struct watcher_list {
258
  RB_ENTRY(watcher_list) entry;  /* Tree linkage used by the RB_* macros. */
259
  struct uv__queue watchers;  /* NOTE(review): presumably the handles watching `path` - confirm. */
260
  int iterating;  /* NOTE(review): presumably set while `watchers` is being walked - confirm. */
261
  char* path;
262
  int wd;  /* NOTE(review): presumably the inotify watch descriptor - confirm. */
263
};
264
265
/* Root of the watcher_list red-black tree. It must stay a struct whose only
 * member is a pointer: uv__inotify_watchers() casts the address of
 * loop->inotify_watchers to a pointer to this type.
 */
struct watcher_root {
266
  struct watcher_list* rbh_root;
267
};
268
269
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
270
static int compare_watchers(const struct watcher_list* a,
271
                            const struct watcher_list* b);
272
static void maybe_free_watcher_list(struct watcher_list* w,
273
                                    uv_loop_t* loop);
274
275
static void uv__epoll_ctl_flush(int epollfd,
276
                                struct uv__iou* ctl,
277
                                struct epoll_event (*events)[256]);
278
279
static void uv__epoll_ctl_prep(int epollfd,
280
                               struct uv__iou* ctl,
281
                               struct epoll_event (*events)[256],
282
                               int op,
283
                               int fd,
284
                               struct epoll_event* e);
285
286
0
RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
Unexecuted instantiation: linux.c:watcher_root_RB_MINMAX
Unexecuted instantiation: linux.c:watcher_root_RB_FIND
Unexecuted instantiation: linux.c:watcher_root_RB_REMOVE
Unexecuted instantiation: linux.c:watcher_root_RB_REMOVE_COLOR
Unexecuted instantiation: linux.c:watcher_root_RB_INSERT
Unexecuted instantiation: linux.c:watcher_root_RB_INSERT_COLOR
287
0
288
0
289
0
/* View the loop's inotify_watchers slot as the root of the watcher tree. */
static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
  struct watcher_root* root;

  /* struct watcher_root wraps exactly one pointer, which is what makes this
   * cast work. It is type punning that strict aliasing rules would forbid
   * (and it is not pretty either), which is the reason libuv is built with
   * -fno-strict-aliasing.
   */
  root = (struct watcher_root*) &loop->inotify_watchers;

  return root;
}
297
298
299
0
/* Return the running kernel's version packed as
 * major * 65536 + minor * 256 + patch (5.15.90 becomes 0x050F5A), or 0 when
 * the version cannot be determined.
 *
 * A non-zero result is cached in a relaxed atomic and returned directly by
 * later calls; the failure paths return 0 without touching the cache, so
 * they are retried on the next call.
 */
unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;
  char v_sig[256];
  char* needle;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  /* Check /proc/version_signature first as it's the way to get the mainline
   * kernel version in Ubuntu. The format is:
   *   Ubuntu ubuntu_kernel_version mainline_kernel_version
   * For example:
   *   Ubuntu 5.15.0-79.86-generic 5.15.111
   */
  if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
    if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (-1 == uname(&u))
    return 0;

  /* In Debian we need to check `version` instead of `release` to extract the
   * mainline kernel version. This is an example of how it looks like:
   *  #1 SMP Debian 5.10.46-4 (2021-08-03)
   */
  needle = strstr(u.version, "Debian ");
  if (needle != NULL)
    if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* Handle it when the process runs under the UNAME26 personality:
   *
   * - kernels >= 3.x identify as 2.6.40+x
   * - kernels >= 4.x identify as 2.6.60+x
   *
   * UNAME26 is a poorly conceived hack that doesn't let us distinguish
   * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
   * that 2.6.60+x means 4.x.
   *
   * Fun fact of the day: it's technically possible to observe the actual
   * kernel version for a brief moment because uname() first copies out the
   * real release string before overwriting it with the backcompat string.
   */
  if (major == 2 && minor == 6) {
    if (patch >= 60) {
      major = 4;
      minor = patch - 60;
      patch = 0;
    } else if (patch >= 40) {
      major = 3;
      minor = patch - 40;
      patch = 0;
    }
  }

calculate_version:
  /* The patch level only has one byte in the packed encoding but long-lived
   * stable series go past 255 (e.g. 4.19.300). Saturate it, like the kernel's
   * own KERNEL_VERSION() macro does, so that it cannot carry into the minor
   * byte and make 4.19.300 look like 4.20.44.
   */
  if (patch > 255)
    patch = 255;

  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
369
370
371
/* Raw copy_file_range() system call wrapper. When no syscall number is known
 * for the target architecture it fails with errno set to ENOSYS.
 */
ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifndef __NR_copy_file_range
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_copy_file_range,
                 fd_in, off_in,
                 fd_out, off_out,
                 len,
                 flags);
#endif
}
391
392
393
int uv__statx(int dirfd,
394
              const char* path,
395
              int flags,
396
              unsigned int mask,
397
0
              struct uv__statx* statxbuf) {
398
#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
399
  return errno = ENOSYS, -1;
400
#else
401
0
  int rc;
402
403
0
  rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
404
0
  if (rc >= 0)
405
0
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
406
407
0
  return rc;
408
0
#endif
409
0
}
410
411
412
0
/* Raw getrandom() system call wrapper. Fails with errno set to ENOSYS when
 * the syscall number is unknown or when building for Android API level < 28.
 * On success the whole buffer is marked as initialized for MSan.
 */
ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if defined(__NR_getrandom) && !(defined(__ANDROID_API__) && __ANDROID_API__ < 28)
  ssize_t rc;

  rc = syscall(__NR_getrandom, buf, buflen, flags);
  if (rc < 0)
    return rc;

  /* The kernel filled in the buffer behind the sanitizer's back. */
  uv__msan_unpoison(buf, buflen);

  return rc;
#else
  errno = ENOSYS;
  return -1;
#endif
}
425
426
427
0
/* Raw io_uring_setup() system call; returns whatever syscall() returns. */
int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  int rc;

  rc = syscall(__NR_io_uring_setup, entries, params);

  return rc;
}
430
431
432
/* Raw io_uring_enter() system call; returns whatever syscall() returns. */
int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  int rc;

  /* The two trailing arguments used to carry a sigset_t and its size. Newer
   * kernels ignore them unless IORING_ENTER_EXT_ARG is set, in which case
   * they describe a struct io_uring_getevents_arg instead. Neither form is
   * used here, hence NULL and zero.
   */
  rc = syscall(__NR_io_uring_enter,
               fd,
               to_submit,
               min_complete,
               flags,
               NULL,
               0L);

  return rc;
}
448
449
450
0
/* Raw io_uring_register() system call; returns whatever syscall() returns. */
int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  int rc;

  rc = syscall(__NR_io_uring_register, fd, opcode, arg, nargs);

  return rc;
}
453
454
455
0
static int uv__use_io_uring(uint32_t flags) {
456
#if defined(__ANDROID_API__)
457
  return 0;  /* Possibly available but blocked by seccomp. */
458
#elif defined(__arm__) && __SIZEOF_POINTER__ == 4
459
  /* See https://github.com/libuv/libuv/issues/4158. */
460
  return 0;  /* All 32 bits kernels appear buggy. */
461
#elif defined(__powerpc64__) || defined(__ppc64__)
462
  /* See https://github.com/libuv/libuv/issues/4283. */
463
  return 0; /* Random SIGSEGV in signal handler. */
464
#else
465
  /* Ternary: unknown=0, yes=1, no=-1 */
466
0
  static _Atomic int use_io_uring;
467
0
  char* val;
468
0
  int use;
469
470
#if defined(__hppa__)
471
  /* io_uring first supported on parisc in 6.1, functional in .51
472
   * https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/
473
   */
474
  if (uv__kernel_version() < /*6.1.51*/0x060133)
475
    return 0;
476
#endif
477
478
  /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
479
0
  if (0 == (flags & UV__IORING_SETUP_SQPOLL))
480
0
    return 1;
481
482
  /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
483
0
  if (uv__kernel_version() < /*5.10.186*/0x050ABA)
484
0
    return 0;
485
486
0
  use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
487
488
0
  if (use == 0) {
489
0
    val = getenv("UV_USE_IO_URING");
490
0
    use = val != NULL && atoi(val) > 0 ? 1 : -1;
491
0
    atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
492
0
  }
493
494
0
  return use > 0;
495
0
#endif
496
0
}
497
498
499
static void uv__iou_init(int epollfd,
500
                         struct uv__iou* iou,
501
                         uint32_t entries,
502
0
                         uint32_t flags) {
503
0
  struct uv__io_uring_params params;
504
0
  struct epoll_event e;
505
0
  size_t cqlen;
506
0
  size_t sqlen;
507
0
  size_t maxlen;
508
0
  size_t sqelen;
509
0
  unsigned kernel_version;
510
0
  uint32_t* sqarray;
511
0
  uint32_t i;
512
0
  char* sq;
513
0
  char* sqe;
514
0
  int ringfd;
515
0
  int no_sqarray;
516
517
0
  sq = MAP_FAILED;
518
0
  sqe = MAP_FAILED;
519
520
0
  if (!uv__use_io_uring(flags))
521
0
    return;
522
523
0
  kernel_version = uv__kernel_version();
524
0
  no_sqarray =
525
0
      UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);
526
527
  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
528
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
529
   */
530
0
  memset(&params, 0, sizeof(params));
531
0
  params.flags = flags | no_sqarray;
532
533
0
  if (flags & UV__IORING_SETUP_SQPOLL)
534
0
    params.sq_thread_idle = 10;  /* milliseconds */
535
536
  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
537
0
  ringfd = uv__io_uring_setup(entries, &params);
538
0
  if (ringfd == -1)
539
0
    return;
540
541
  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
542
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
543
   */
544
0
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
545
0
    goto fail;
546
547
  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
548
0
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
549
0
    goto fail;
550
551
  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
552
0
  if (!(params.features & UV__IORING_FEAT_NODROP))
553
0
    goto fail;
554
555
0
  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
556
0
  cqlen =
557
0
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
558
0
  maxlen = sqlen < cqlen ? cqlen : sqlen;
559
0
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
560
561
0
  sq = mmap(0,
562
0
            maxlen,
563
0
            PROT_READ | PROT_WRITE,
564
0
            MAP_SHARED | MAP_POPULATE,
565
0
            ringfd,
566
0
            0);  /* IORING_OFF_SQ_RING */
567
568
0
  sqe = mmap(0,
569
0
             sqelen,
570
0
             PROT_READ | PROT_WRITE,
571
0
             MAP_SHARED | MAP_POPULATE,
572
0
             ringfd,
573
0
             0x10000000ull);  /* IORING_OFF_SQES */
574
575
0
  if (sq == MAP_FAILED || sqe == MAP_FAILED)
576
0
    goto fail;
577
578
0
  if (flags & UV__IORING_SETUP_SQPOLL) {
579
    /* Only interested in completion events. To get notified when
580
     * the kernel pulls items from the submission ring, add POLLOUT.
581
     */
582
0
    memset(&e, 0, sizeof(e));
583
0
    e.events = POLLIN;
584
0
    e.data.fd = ringfd;
585
586
0
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
587
0
      goto fail;
588
0
  }
589
590
0
  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
591
0
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
592
0
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
593
0
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
594
0
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
595
0
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
596
0
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
597
0
  iou->sq = sq;
598
0
  iou->cqe = sq + params.cq_off.cqes;
599
0
  iou->sqe = sqe;
600
0
  iou->sqlen = sqlen;
601
0
  iou->cqlen = cqlen;
602
0
  iou->maxlen = maxlen;
603
0
  iou->sqelen = sqelen;
604
0
  iou->ringfd = ringfd;
605
0
  iou->in_flight = 0;
606
607
0
  if (no_sqarray)
608
0
    return;
609
610
0
  sqarray = (uint32_t*) (sq + params.sq_off.array);
611
0
  for (i = 0; i <= iou->sqmask; i++)
612
0
    sqarray[i] = i;  /* Slot -> sqe identity mapping. */
613
614
0
  return;
615
616
0
fail:
617
0
  if (sq != MAP_FAILED)
618
0
    munmap(sq, maxlen);
619
620
0
  if (sqe != MAP_FAILED)
621
0
    munmap(sqe, sqelen);
622
623
0
  uv__close(ringfd);
624
0
}
625
626
627
0
static void uv__iou_delete(struct uv__iou* iou) {
628
0
  if (iou->ringfd > -1) {
629
0
    munmap(iou->sq, iou->maxlen);
630
0
    munmap(iou->sqe, iou->sqelen);
631
0
    uv__close(iou->ringfd);
632
0
    iou->ringfd = -1;
633
0
  }
634
0
}
635
636
637
0
/* Linux-specific loop setup: reset the inotify and io_uring state, create
 * the epoll instance and try to set up the `ctl` ring.
 *
 * Returns 0 on success or UV__ERR(errno) when epoll_create1() fails. A
 * failure to create the `ctl` ring is not reported; uv__iou_init() simply
 * leaves ctl.ringfd at -1.
 */
int uv__platform_loop_init(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  loop->inotify_watchers = NULL;
  loop->inotify_fd = -1;

  lfields = uv__get_internal_fields(loop);
  lfields->ctl.ringfd = -1;
  lfields->iou.ringfd = -2;  /* "uninitialized" */

  loop->backend_fd = epoll_create1(O_CLOEXEC);
  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);

  return 0;
}
655
656
657
0
/* Rebuild the loop's platform state: close the epoll fd, drop the rings and
 * the inotify fd, re-run uv__platform_loop_init() and then hand the old
 * watcher tree to uv__inotify_fork().
 *
 * Returns 0 on success or the error from whichever step failed.
 */
int uv__io_fork(uv_loop_t* loop) {
  struct watcher_list* root;
  int err;

  /* Save the tree root now; uv__platform_loop_init() resets
   * loop->inotify_watchers to NULL.
   */
  root = uv__inotify_watchers(loop)->rbh_root;

  uv__close(loop->backend_fd);
  loop->backend_fd = -1;

  /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
  uv__platform_loop_delete(loop);

  err = uv__platform_loop_init(loop);
  if (err == 0)
    err = uv__inotify_fork(loop, root);

  return err;
}
675
676
677
0
/* Release the Linux-specific loop resources: both io_uring instances and,
 * when one is open, the inotify file descriptor and its read watcher.
 */
void uv__platform_loop_delete(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  uv__iou_delete(&lfields->ctl);
  uv__iou_delete(&lfields->iou);

  if (loop->inotify_fd == -1)
    return;

  uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
  uv__close(loop->inotify_fd);
  loop->inotify_fd = -1;
}
690
691
692
/* State that uv__platform_invalidate_fd() reaches through lfields->inv in
 * order to overwrite data.fd with -1 in every listed event that refers to a
 * given file descriptor.
 */
struct uv__invalidate {
693
  struct epoll_event (*prep)[256];  /* NOTE(review): not read by uv__platform_invalidate_fd(); presumably the pending epoll_ctl batch - confirm. */
694
  struct epoll_event* events;  /* Array scanned by uv__platform_invalidate_fd(). */
695
  int nfds;  /* Number of entries of `events` that are scanned. */
696
};
697
698
699
0
/* Make the loop forget about `fd`: events for it that are listed in
 * lfields->inv are neutralized and the descriptor is removed from the
 * epoll set. The result of the removal is deliberately ignored.
 */
void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  uv__loop_internal_fields_t* lfields;
  struct uv__invalidate* inv;
  struct epoll_event* ev;
  struct epoll_event dummy;
  int i;

  lfields = uv__get_internal_fields(loop);
  inv = lfields->inv;

  /* Invalidate events with same file descriptor */
  if (inv != NULL) {
    for (i = 0; i < inv->nfds; i++) {
      ev = &inv->events[i];
      if (ev->data.fd == fd)
        ev->data.fd = -1;
    }
  }

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * Perform EPOLL_CTL_DEL immediately instead of going through
   * io_uring's submit queue, otherwise the file descriptor may
   * be closed by the time the kernel starts the operation.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   *
   * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
   * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
   * Zeroing the dummy keeps that flag clear.
   */
  memset(&dummy, 0, sizeof(dummy));
  epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
}
730
731
732
0
/* Probe whether `fd` can be added to the loop's epoll set by adding it and
 * removing it again.
 *
 * Returns 0 when the add succeeds or fails with EEXIST, otherwise
 * UV__ERR(errno) from the failed add. Aborts the process when the follow-up
 * removal fails.
 */
int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e) != 0 &&
      errno != EEXIST)
    rc = UV__ERR(errno);

  if (rc != 0)
    return rc;

  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e) != 0)
    abort();

  return 0;
}
751
752
753
/* Caller must initialize SQE and call uv__iou_submit(). */
754
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
755
                                                uv_loop_t* loop,
756
0
                                                uv_fs_t* req) {
757
0
  struct uv__io_uring_sqe* sqe;
758
0
  uint32_t head;
759
0
  uint32_t tail;
760
0
  uint32_t mask;
761
0
  uint32_t slot;
762
763
  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
764
   * initialization failed. Anything else is a valid ring file descriptor.
765
   */
766
0
  if (iou->ringfd == -2) {
767
    /* By default, the SQPOLL is not created. Enable only if the loop is
768
     * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
769
     * environment variable is unset or a positive number.
770
     */
771
0
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
772
0
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
773
0
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
774
775
0
    if (iou->ringfd == -2)
776
0
      iou->ringfd = -1;  /* "failed" */
777
0
  }
778
779
0
  if (iou->ringfd == -1)
780
0
    return NULL;
781
782
0
  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
783
0
                              memory_order_acquire);
784
0
  tail = *iou->sqtail;
785
0
  mask = iou->sqmask;
786
787
0
  if ((head & mask) == ((tail + 1) & mask))
788
0
    return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
789
790
0
  slot = tail & mask;
791
0
  sqe = iou->sqe;
792
0
  sqe = &sqe[slot];
793
0
  memset(sqe, 0, sizeof(*sqe));
794
0
  sqe->user_data = (uintptr_t) req;
795
796
  /* Pacify uv_cancel(). */
797
0
  req->work_req.loop = loop;
798
0
  req->work_req.work = NULL;
799
0
  req->work_req.done = NULL;
800
0
  uv__queue_init(&req->work_req.wq);
801
802
0
  uv__req_register(loop);
803
0
  iou->in_flight++;
804
805
0
  return sqe;
806
0
}
807
808
809
0
static void uv__iou_submit(struct uv__iou* iou) {
810
0
  uint32_t flags;
811
812
0
  atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
813
0
                        *iou->sqtail + 1,
814
0
                        memory_order_release);
815
816
0
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
817
0
                               memory_order_acquire);
818
819
0
  if (flags & UV__IORING_SQ_NEED_WAKEUP)
820
0
    if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
821
0
      if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
822
0
        perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
823
0
}
824
825
826
0
/* Try to queue an asynchronous close of req->file on the loop's io_uring.
 *
 * Returns 1 when the operation was submitted to the ring and 0 when nothing
 * was queued: the kernel is in a range known to misbehave, no ring is
 * available, or the ring is full.
 */
int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;
  unsigned kv;

  kv = uv__kernel_version();
  /* Work around a poorly understood bug in older kernels where closing a file
   * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
   * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
   * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
   * but good candidates are the several data race fixes. Interestingly, it
   * seems to manifest only when running under Docker so the possibility of
   * a Docker bug can't be completely ruled out either. Yay, computers.
   * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
   * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
   * solved.
   */
  if (kv < /* 5.15.90 */ 0x050F5A)
    return 0;

  /* 5.16.0 packs to 0x051000 (minor 16 == 0x10). The lower bound used to be
   * 0x050A00, which is 5.10.0 and therefore always true after the check
   * above, so the fixed 5.15.90+ longterm kernels were rejected as well.
   */
  if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->fd = req->file;
  sqe->opcode = UV__IORING_OP_CLOSE;

  uv__iou_submit(iou);

  return 1;
}
863
864
865
0
/* Try to queue an ftruncate of req->file to req->off on the loop's io_uring.
 * Only attempted when the kernel reports version 6.9 or newer.
 *
 * Returns 1 when the operation was submitted and 0 when nothing was queued.
 */
int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* iou;
  struct uv__io_uring_sqe* sqe;

  if (uv__kernel_version() < /* 6.9 */0x060900)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe != NULL) {
    sqe->opcode = UV__IORING_OP_FTRUNCATE;
    sqe->fd = req->file;
    sqe->off = req->off;
    uv__iou_submit(iou);
    return 1;
  }

  return 0;
}
884
885
int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
886
                                  uv_fs_t* req,
887
0
                                  uint32_t fsync_flags) {
888
0
  struct uv__io_uring_sqe* sqe;
889
0
  struct uv__iou* iou;
890
891
0
  iou = &uv__get_internal_fields(loop)->iou;
892
893
0
  sqe = uv__iou_get_sqe(iou, loop, req);
894
0
  if (sqe == NULL)
895
0
    return 0;
896
897
  /* Little known fact: setting seq->off and seq->len turns
898
   * it into an asynchronous sync_file_range() operation.
899
   */
900
0
  sqe->fd = req->file;
901
0
  sqe->fsync_flags = fsync_flags;
902
0
  sqe->opcode = UV__IORING_OP_FSYNC;
903
904
0
  uv__iou_submit(iou);
905
906
0
  return 1;
907
0
}
908
909
910
0
/* Try to queue a hard link from req->path to req->new_path on the loop's
 * io_uring, both resolved relative to AT_FDCWD. Only attempted when the
 * kernel reports version 5.15.0 or newer.
 *
 * Returns 1 when the operation was submitted and 0 when nothing was queued.
 */
int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* iou;
  struct uv__io_uring_sqe* sqe;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe != NULL) {
    sqe->opcode = UV__IORING_OP_LINKAT;
    sqe->fd = AT_FDCWD;    /* Directory fd for the existing path. */
    sqe->addr = (uintptr_t) req->path;
    sqe->len = AT_FDCWD;   /* Directory fd for the new path. */
    sqe->addr2 = (uintptr_t) req->new_path;
    uv__iou_submit(iou);
    return 1;
  }

  return 0;
}
932
933
934
0
/* Try to queue a mkdir of req->path with mode req->mode on the loop's
 * io_uring, resolved relative to AT_FDCWD. Only attempted when the kernel
 * reports version 5.15.0 or newer.
 *
 * Returns 1 when the operation was submitted and 0 when nothing was queued.
 */
int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* iou;
  struct uv__io_uring_sqe* sqe;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe != NULL) {
    sqe->opcode = UV__IORING_OP_MKDIRAT;
    sqe->fd = AT_FDCWD;
    sqe->addr = (uintptr_t) req->path;
    sqe->len = req->mode;  /* The mode travels in the len field. */
    uv__iou_submit(iou);
    return 1;
  }

  return 0;
}
955
956
957
0
/* Try to queue an open of req->path on the loop's io_uring, resolved
 * relative to AT_FDCWD, with req->flags plus O_CLOEXEC and mode req->mode.
 *
 * Returns 1 when the operation was submitted and 0 when nothing was queued.
 */
int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* iou;
  struct uv__io_uring_sqe* sqe;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe != NULL) {
    sqe->opcode = UV__IORING_OP_OPENAT;
    sqe->fd = AT_FDCWD;
    sqe->addr = (uintptr_t) req->path;
    sqe->open_flags = req->flags | O_CLOEXEC;
    sqe->len = req->mode;  /* The mode travels in the len field. */
    uv__iou_submit(iou);
    return 1;
  }

  return 0;
}
977
978
979
0
/* Try to queue a rename of req->path to req->new_path on the loop's
 * io_uring, both resolved relative to AT_FDCWD.
 *
 * Returns 1 when the operation was submitted and 0 when nothing was queued.
 */
int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* iou;
  struct uv__io_uring_sqe* sqe;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe != NULL) {
    sqe->opcode = UV__IORING_OP_RENAMEAT;
    sqe->fd = AT_FDCWD;    /* Directory fd for the old path. */
    sqe->addr = (uintptr_t) req->path;
    sqe->len = AT_FDCWD;   /* Directory fd for the new path. */
    sqe->addr2 = (uintptr_t) req->new_path;
    uv__iou_submit(iou);
    return 1;
  }

  return 0;
}
999
1000
1001
0
/* Queue an asynchronous SYMLINKAT operation for `req` on the loop's io_uring,
 * passing req->path in `addr` and req->new_path in `addr2`.
 *
 * Returns 1 once the request has been submitted. Returns 0 without queueing
 * anything when the running kernel is older than 5.15.0 or when
 * uv__iou_get_sqe() yields no submission entry.
 */
int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* ring;
  struct uv__io_uring_sqe* entry;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  ring = &uv__get_internal_fields(loop)->iou;
  entry = uv__iou_get_sqe(ring, loop, req);
  if (!entry)
    return 0;

  entry->opcode = UV__IORING_OP_SYMLINKAT;
  entry->fd = AT_FDCWD;
  entry->addr = (uintptr_t) req->path;
  entry->addr2 = (uintptr_t) req->new_path;

  uv__iou_submit(ring);
  return 1;
}
1022
1023
1024
0
/* Queue an asynchronous UNLINKAT operation for req->path on the loop's
 * io_uring.
 *
 * Returns 1 once the request has been submitted, or 0 when
 * uv__iou_get_sqe() yields no submission entry.
 */
int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__iou* ring;
  struct uv__io_uring_sqe* entry;

  ring = &uv__get_internal_fields(loop)->iou;
  entry = uv__iou_get_sqe(ring, loop, req);
  if (!entry)
    return 0;

  /* AT_FDCWD: a relative path is resolved against the working directory. */
  entry->opcode = UV__IORING_OP_UNLINKAT;
  entry->fd = AT_FDCWD;
  entry->addr = (uintptr_t) req->path;

  uv__iou_submit(ring);
  return 1;
}
1042
1043
1044
int uv__iou_fs_read_or_write(uv_loop_t* loop,
1045
                             uv_fs_t* req,
1046
0
                             int is_read) {
1047
0
  struct uv__io_uring_sqe* sqe;
1048
0
  struct uv__iou* iou;
1049
1050
  /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fallback
1051
   * to the threadpool on writes */
1052
0
  if (req->nbufs > IOV_MAX) {
1053
0
    if (is_read)
1054
0
      req->nbufs = IOV_MAX;
1055
0
    else
1056
0
      return 0;
1057
0
  }
1058
1059
0
  iou = &uv__get_internal_fields(loop)->iou;
1060
1061
0
  sqe = uv__iou_get_sqe(iou, loop, req);
1062
0
  if (sqe == NULL)
1063
0
    return 0;
1064
1065
0
  sqe->addr = (uintptr_t) req->bufs;
1066
0
  sqe->fd = req->file;
1067
0
  sqe->len = req->nbufs;
1068
0
  sqe->off = req->off < 0 ? -1 : req->off;
1069
0
  sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1070
1071
0
  uv__iou_submit(iou);
1072
1073
0
  return 1;
1074
0
}
1075
1076
1077
int uv__iou_fs_statx(uv_loop_t* loop,
1078
                     uv_fs_t* req,
1079
                     int is_fstat,
1080
0
                     int is_lstat) {
1081
0
  struct uv__io_uring_sqe* sqe;
1082
0
  struct uv__statx* statxbuf;
1083
0
  struct uv__iou* iou;
1084
1085
0
  statxbuf = uv__malloc(sizeof(*statxbuf));
1086
0
  if (statxbuf == NULL)
1087
0
    return 0;
1088
1089
0
  iou = &uv__get_internal_fields(loop)->iou;
1090
1091
0
  sqe = uv__iou_get_sqe(iou, loop, req);
1092
0
  if (sqe == NULL) {
1093
0
    uv__free(statxbuf);
1094
0
    return 0;
1095
0
  }
1096
1097
0
  req->ptr = statxbuf;
1098
1099
0
  sqe->addr = (uintptr_t) req->path;
1100
0
  sqe->addr2 = (uintptr_t) statxbuf;
1101
0
  sqe->fd = AT_FDCWD;
1102
0
  sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1103
0
  sqe->opcode = UV__IORING_OP_STATX;
1104
1105
0
  if (is_fstat) {
1106
0
    sqe->addr = (uintptr_t) "";
1107
0
    sqe->fd = req->file;
1108
0
    sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1109
0
  }
1110
1111
0
  if (is_lstat)
1112
0
    sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1113
1114
0
  uv__iou_submit(iou);
1115
1116
0
  return 1;
1117
0
}
1118
1119
1120
0
/* Translate a `struct uv__statx` into libuv's `uv_stat_t`.
 *
 * Device numbers are assembled from their major/minor halves with makedev().
 * st_flags and st_gen are not taken from the statx data and are set to 0.
 */
void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
  /* Device identifiers. */
  buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
  buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);

  /* Identity, ownership and permissions. */
  buf->st_ino = statxbuf->stx_ino;
  buf->st_mode = statxbuf->stx_mode;
  buf->st_nlink = statxbuf->stx_nlink;
  buf->st_uid = statxbuf->stx_uid;
  buf->st_gid = statxbuf->stx_gid;

  /* Size information. */
  buf->st_size = statxbuf->stx_size;
  buf->st_blksize = statxbuf->stx_blksize;
  buf->st_blocks = statxbuf->stx_blocks;

  /* Timestamps: access, modification, status change, birth. */
  buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
  buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
  buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
  buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
  buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
  buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
  buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
  buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;

  /* Fields without a statx source. */
  buf->st_flags = 0;
  buf->st_gen = 0;
}
1142
1143
1144
0
/* Completion step for a statx request queued by uv__iou_fs_statx().
 *
 * Takes ownership of the temporary statx buffer stored in req->ptr. When
 * req->result is 0 the buffer is converted into req->statbuf and req->ptr is
 * pointed at that stat buffer; otherwise req->ptr is left NULL. The temporary
 * buffer is freed in both cases.
 */
static void uv__iou_fs_statx_post(uv_fs_t* req) {
  struct uv__statx* raw;

  raw = req->ptr;

  if (req->result != 0) {
    req->ptr = NULL;
  } else {
    uv__msan_unpoison(raw, sizeof(*raw));
    uv__statx_to_stat(raw, &req->statbuf);
    req->ptr = &req->statbuf;
  }

  uv__free(raw);
}
1160
1161
1162
0
/* Drain the completion queue of `iou` and run the callback of every finished
 * file system request.
 *
 * For each completion the request is unregistered from the loop and the
 * in-flight counter is decremented. A completion of -EOPNOTSUPP is handed to
 * uv__fs_post() instead (the thread pool retry) and is not counted as an
 * event. After draining, the consumed head is published and, if the kernel
 * flagged a completion queue overflow, the ring is entered once so that the
 * overflowed entries become available for the next loop iteration.
 */
static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* ring;
  struct uv__io_uring_cqe* entry;
  uv_fs_t* req;
  uint32_t pos;
  uint32_t tail;
  uint32_t mask;
  uint32_t flags;
  int nevents;
  int rc;

  pos = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  ring = iou->cqe;
  nevents = 0;

  for (; pos != tail; pos++) {
    entry = &ring[pos & mask];

    req = (uv_fs_t*) (uintptr_t) entry->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop);
    iou->in_flight--;

    /* Operation unknown to this kernel: retry it on the thread pool. */
    if (entry->res == -EOPNOTSUPP) {
      uv__fs_post(loop, req);
      continue;
    }

    /* io_uring reports errors as negative numbers, just like libuv. */
    req->result = entry->res;

    /* The stat family needs its statx buffer converted before the callback. */
    if (req->fs_type == UV_FS_FSTAT ||
        req->fs_type == UV_FS_LSTAT ||
        req->fs_type == UV_FS_STAT) {
      uv__iou_fs_statx_post(req);
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* If completions overflowed, enter the kernel so it can flush them into
   * the queue. They are picked up in the next loop iteration rather than
   * right away, to avoid starving the rest of the loop.
   */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    for (;;) {
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
      if (!(rc == -1 && errno == EINTR))
        break;
    }

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}
1237
1238
1239
/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1240
 * executed immediately, otherwise the file descriptor may have been closed
1241
 * by the time the kernel starts the operation.
1242
 */
1243
static void uv__epoll_ctl_prep(int epollfd,
1244
                               struct uv__iou* ctl,
1245
                               struct epoll_event (*events)[256],
1246
                               int op,
1247
                               int fd,
1248
0
                               struct epoll_event* e) {
1249
  /* FIXME: Avoid dangling pointer to uv__io_poll stack frame.  */
1250
0
#ifndef __clang_analyzer__ /* core.StackAddressEscape */
1251
0
  struct uv__io_uring_sqe* sqe;
1252
0
  struct epoll_event* pe;
1253
0
  uint32_t mask;
1254
0
  uint32_t slot;
1255
1256
0
  assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1257
0
  assert(ctl->ringfd != -1);
1258
1259
0
  mask = ctl->sqmask;
1260
0
  slot = (*ctl->sqtail)++ & mask;
1261
1262
0
  pe = &(*events)[slot];
1263
0
  *pe = *e;
1264
1265
0
  sqe = ctl->sqe;
1266
0
  sqe = &sqe[slot];
1267
1268
0
  memset(sqe, 0, sizeof(*sqe));
1269
0
  sqe->addr = (uintptr_t) pe;
1270
0
  sqe->fd = epollfd;
1271
0
  sqe->len = op;
1272
0
  sqe->off = fd;
1273
0
  sqe->opcode = UV__IORING_OP_EPOLL_CTL;
1274
0
  sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1275
1276
0
  if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1277
0
    uv__epoll_ctl_flush(epollfd, ctl, events);
1278
0
#endif
1279
0
}
1280
1281
1282
static void uv__epoll_ctl_flush(int epollfd,
1283
                                struct uv__iou* ctl,
1284
0
                                struct epoll_event (*events)[256]) {
1285
0
  struct epoll_event oldevents[256];
1286
0
  struct uv__io_uring_cqe* cqe;
1287
0
  uint32_t oldslot;
1288
0
  uint32_t slot;
1289
0
  uint32_t n;
1290
0
  int fd;
1291
0
  int op;
1292
0
  int rc;
1293
1294
0
  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1295
0
  assert(ctl->ringfd != -1);
1296
0
  assert(*ctl->sqhead != *ctl->sqtail);
1297
1298
0
  n = *ctl->sqtail - *ctl->sqhead;
1299
0
  do
1300
0
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1301
0
  while (rc == -1 && errno == EINTR);
1302
1303
0
  if (rc < 0)
1304
0
    perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1305
1306
0
  if (rc != (int) n)
1307
0
    abort();
1308
1309
0
  assert(*ctl->sqhead == *ctl->sqtail);
1310
1311
0
  memcpy(oldevents, *events, sizeof(*events));
1312
1313
  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1314
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1315
   * that we are already watching. Ignore the former and retry the latter
1316
   * with EPOLL_CTL_MOD.
1317
   */
1318
0
  while (*ctl->cqhead != *ctl->cqtail) {
1319
0
    slot = (*ctl->cqhead)++ & ctl->cqmask;
1320
1321
0
    cqe = ctl->cqe;
1322
0
    cqe = &cqe[slot];
1323
1324
0
    if (cqe->res == 0)
1325
0
      continue;
1326
1327
0
    fd = cqe->user_data >> 32;
1328
0
    op = 3 & cqe->user_data;
1329
0
    oldslot = 255 & (cqe->user_data >> 2);
1330
1331
0
    if (op == EPOLL_CTL_DEL)
1332
0
      continue;
1333
1334
0
    if (op != EPOLL_CTL_ADD)
1335
0
      abort();
1336
1337
0
    if (cqe->res != -EEXIST)
1338
0
      abort();
1339
1340
0
    uv__epoll_ctl_prep(epollfd,
1341
0
                       ctl,
1342
0
                       events,
1343
0
                       EPOLL_CTL_MOD,
1344
0
                       fd,
1345
0
                       &oldevents[oldslot]);
1346
0
  }
1347
0
}
1348
1349
1350
0
/* Poll the loop's epoll descriptor once (possibly several rounds) and
 * dispatch I/O callbacks.
 *
 * timeout is in the unit epoll_pwait() expects: -1 blocks indefinitely,
 * 0 never blocks. Pending watcher changes are first pushed to epoll, either
 * directly or batched through the `ctl` io_uring when one is available.
 * Signal watchers run after all other watchers of a round. Polling stops when
 * io_uring or signal events were seen, when the timeout is used up, or when
 * there is nothing left to wait for.
 */
void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct epoll_event events[1024];
  struct epoll_event prep[256];
  struct uv__invalidate inv;
  struct epoll_event* pe;
  struct epoll_event ev;
  struct uv__iou* ctl;
  struct uv__iou* iou;
  struct uv__queue* node;
  uv__io_t* watcher;
  sigset_t* sigmask;
  sigset_t sigset;
  uint64_t base;
  int real_timeout;
  int user_timeout;
  int reset_timeout;
  int have_iou_events;
  int have_signals;
  int nevents;
  int epollfd;
  int count;
  int nfds;
  int fd;
  int op;
  int idx;

  lfields = uv__get_internal_fields(loop);
  ctl = &lfields->ctl;
  iou = &lfields->iou;

  /* Optionally keep SIGPROF blocked while inside epoll_pwait(). */
  sigmask = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask = &sigset;
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  /* With idle-time metrics enabled the first poll never blocks; the caller's
   * timeout is restored after that first round.
   */
  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  epollfd = loop->backend_fd;

  memset(&ev, 0, sizeof(ev));

  /* Register or update every watcher whose event mask changed. */
  while (!uv__queue_empty(&loop->watcher_queue)) {
    node = uv__queue_head(&loop->watcher_queue);
    watcher = uv__queue_data(node, uv__io_t, watcher_queue);
    uv__queue_remove(node);
    uv__queue_init(node);

    /* A watcher with no events yet has not been added to epoll. */
    op = (watcher->events == 0) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;

    watcher->events = watcher->pevents;
    ev.events = watcher->pevents;
    ev.data.fd = watcher->fd;
    fd = watcher->fd;

    if (ctl->ringfd != -1) {
      uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &ev);
      continue;
    }

    if (!epoll_ctl(epollfd, op, fd, &ev))
      continue;

    assert(op == EPOLL_CTL_ADD);
    assert(errno == EEXIST);

    /* File descriptor that's been watched before, update event mask. */
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &ev))
      abort();
  }

  inv.events = events;
  inv.prep = &prep;
  inv.nfds = -1;

  for (;;) {
    /* Nothing watched and nothing in flight: nothing to wait for. */
    if (loop->nfds == 0 && iou->in_flight == 0)
      break;

    /* All event mask mutations should be visible to the kernel before
     * we enter epoll_pwait().
     */
    if (ctl->ringfd != -1)
      while (*ctl->sqhead != *ctl->sqtail)
        uv__epoll_ctl_flush(epollfd, ctl, &prep);

    uv__io_poll_prepare(loop, NULL, timeout);
    nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
    uv__io_poll_check(loop, NULL);

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (nfds == 0 || nfds == -1) {
      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_iou_events = 0;
    have_signals = 0;
    nevents = 0;

    /* Publish the batch so closed descriptors can be invalidated in place
     * while callbacks run.
     */
    inv.nfds = nfds;
    lfields->inv = &inv;

    for (idx = 0; idx < nfds; idx++) {
      pe = &events[idx];
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      if (fd == iou->ringfd) {
        uv__poll_io_uring(loop, iou);
        have_iou_events = 1;
        continue;
      }

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      watcher = loop->watchers[fd];

      if (watcher == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         *
         * Perform EPOLL_CTL_DEL immediately instead of going through
         * io_uring's submit queue, otherwise the file descriptor may
         * be closed by the time the kernel starts the operation.
         */
        epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only the events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has
       * stopped the current watcher. Also filters out events that the user
       * has not requested us to watch.
       */
      pe->events &= watcher->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again.  By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read.  If anything, libuv is to blame here.  The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event.  We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          watcher->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events == 0)
        continue;

      /* Run signal watchers last.  This also affects child process watchers
       * because those are implemented in terms of signal watchers.
       */
      if (watcher == &loop->signal_io_watcher) {
        have_signals = 1;
      } else {
        uv__metrics_update_idle_time(loop);
        uv__io_cb(loop, watcher, pe->events);
      }

      nevents++;
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      uv__signal_event(loop, &loop->signal_io_watcher, POLLIN);
    }

    lfields->inv = NULL;

    if (have_iou_events != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (have_signals != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      break;
    }

update_timeout:
    if (timeout == 0)
      break;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    /* Charge the time elapsed since entry against the remaining budget. */
    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      break;

    timeout = real_timeout;
  }

  /* Do not leave staged epoll_ctl commands behind: they reference `prep`,
   * which lives in this stack frame.
   */
  if (ctl->ringfd != -1)
    while (*ctl->sqhead != *ctl->sqtail)
      uv__epoll_ctl_flush(epollfd, ctl, &prep);
}
1605
1606
0
/* Return a monotonic timestamp in nanoseconds, or 0 if clock_gettime() fails.
 *
 * For UV_CLOCK_FAST the function prefers CLOCK_MONOTONIC_COARSE, but only when
 * clock_getres() reports a resolution of one millisecond or better; the
 * decision is made once and cached in a static atomic. Every other clock type
 * reads CLOCK_MONOTONIC.
 */
uint64_t uv__hrtime(uv_clocktype_t type) {
  /* clockid_t, not clock_t: these values identify a clock for
   * clock_getres()/clock_gettime(), they are not clock() tick counts.
   * -1 means "not probed yet".
   */
  static _Atomic clockid_t fast_clock_id = -1;
  struct timespec t;
  clockid_t clock_id;

  /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
   * millisecond granularity or better.  CLOCK_MONOTONIC_COARSE is
   * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
   * decide to make a costly system call.
   */
  /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
   * when it has microsecond granularity or better (unlikely).
   */
  clock_id = CLOCK_MONOTONIC;

  if (type == UV_CLOCK_FAST) {
    clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);

    if (clock_id == -1) {
      /* First call: probe the coarse clock and remember the outcome. */
      clock_id = CLOCK_MONOTONIC;
      if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
        if (t.tv_nsec <= 1 * 1000 * 1000)
          clock_id = CLOCK_MONOTONIC_COARSE;

      atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
    }
  }

  if (clock_gettime(clock_id, &t))
    return 0;  /* Not really possible. */

  return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
}
1641
1642
1643
0
int uv_resident_set_memory(size_t* rss) {
1644
0
  char buf[1024];
1645
0
  const char* s;
1646
0
  long val;
1647
0
  int rc;
1648
0
  int i;
1649
1650
  /* rss: 24th element */
1651
0
  rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1652
0
  if (rc < 0)
1653
0
    return rc;
1654
1655
  /* find the last ')' */
1656
0
  s = strrchr(buf, ')');
1657
0
  if (s == NULL)
1658
0
    goto err;
1659
1660
0
  for (i = 1; i <= 22; i++) {
1661
0
    s = strchr(s + 1, ' ');
1662
0
    if (s == NULL)
1663
0
      goto err;
1664
0
  }
1665
1666
0
  errno = 0;
1667
0
  val = strtol(s, NULL, 10);
1668
0
  if (val < 0 || errno != 0)
1669
0
    goto err;
1670
1671
0
  *rss = val * getpagesize();
1672
0
  return 0;
1673
1674
0
err:
1675
0
  return UV_EINVAL;
1676
0
}
1677
1678
0
int uv_uptime(double* uptime) {
1679
0
  struct timespec now;
1680
0
  char buf[128];
1681
1682
  /* Consult /proc/uptime when present (common case), or fall back to
1683
   * clock_gettime. Why not always clock_gettime? It doesn't always return the
1684
   * right result under OpenVZ and possibly other containerized environments.
1685
   */
1686
0
  if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1687
0
    if (1 == sscanf(buf, "%lf", uptime))
1688
0
      return 0;
1689
1690
0
  if (clock_gettime(CLOCK_BOOTTIME, &now))
1691
0
    return UV__ERR(errno);
1692
1693
0
  *uptime = now.tv_sec;
1694
0
  return 0;
1695
0
}
1696
1697
1698
0
/* Collect model name, current frequency and accumulated CPU times for every
 * CPU listed in /proc/stat.
 *
 * On success *ci points to one uv__malloc() allocation that holds *count
 * entries followed by the table of model strings the entries refer to, and
 * 0 is returned. On failure UV_ENOMEM or UV__ERR(errno) is returned; when
 * the result allocation fails *count is 0.
 *
 * Sources:
 *   /proc/stat     per-CPU times (also decides which CPUs are reported)
 *   /proc/cpuinfo  model names; optional, entries keep "unknown" without it
 *   /sys/devices/system/cpu/cpuN/cpufreq/scaling_cur_freq
 *                  frequency; optional, speed stays 0 without it
 */
int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
#if defined(__PPC__)
  static const char model_marker[] = "cpu\t\t: ";
  static const char model_marker2[] = "";
#elif defined(__arm__)
  static const char model_marker[] = "model name\t: ";
  static const char model_marker2[] = "Processor\t: ";
#elif defined(__aarch64__)
  static const char model_marker[] = "CPU part\t: ";
  static const char model_marker2[] = "";
#elif defined(__mips__)
  static const char model_marker[] = "cpu model\t\t: ";
  static const char model_marker2[] = "";
#elif defined(__loongarch__)
  static const char model_marker[] = "cpu family\t\t: ";
  static const char model_marker2[] = "";
#else
  static const char model_marker[] = "model name\t: ";
  static const char model_marker2[] = "";
#endif
  /* "<part code>\n<model name>\n" pairs; empty except on arm64. */
  static const char parts[] =
#ifdef __aarch64__
    "0x811\nARM810\n"       "0x920\nARM920\n"      "0x922\nARM922\n"
    "0x926\nARM926\n"       "0x940\nARM940\n"      "0x946\nARM946\n"
    "0x966\nARM966\n"       "0xa20\nARM1020\n"      "0xa22\nARM1022\n"
    "0xa26\nARM1026\n"      "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
    "0xb56\nARM1156\n"      "0xb76\nARM1176\n"      "0xc05\nCortex-A5\n"
    "0xc07\nCortex-A7\n"    "0xc08\nCortex-A8\n"    "0xc09\nCortex-A9\n"
    "0xc0d\nCortex-A17\n"   /* Originally A12 */
    "0xc0f\nCortex-A15\n"   "0xc0e\nCortex-A17\n"   "0xc14\nCortex-R4\n"
    "0xc15\nCortex-R5\n"    "0xc17\nCortex-R7\n"    "0xc18\nCortex-R8\n"
    "0xc20\nCortex-M0\n"    "0xc21\nCortex-M1\n"    "0xc23\nCortex-M3\n"
    "0xc24\nCortex-M4\n"    "0xc27\nCortex-M7\n"    "0xc60\nCortex-M0+\n"
    "0xd01\nCortex-A32\n"   "0xd03\nCortex-A53\n"   "0xd04\nCortex-A35\n"
    "0xd05\nCortex-A55\n"   "0xd06\nCortex-A65\n"   "0xd07\nCortex-A57\n"
    "0xd08\nCortex-A72\n"   "0xd09\nCortex-A73\n"   "0xd0a\nCortex-A75\n"
    "0xd0b\nCortex-A76\n"   "0xd0c\nNeoverse-N1\n"  "0xd0d\nCortex-A77\n"
    "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n"   "0xd20\nCortex-M23\n"
    "0xd21\nCortex-M33\n"   "0xd41\nCortex-A78\n"   "0xd42\nCortex-A78AE\n"
    "0xd4a\nNeoverse-E1\n"  "0xd4b\nCortex-A78C\n"  "0xd4f\nNeoverse-V2\n"
#endif
    "";
  struct cpu {
    unsigned long long freq, user, nice, sys, idle, irq;
    unsigned model;
  };
  FILE* fp;
  char* p;
  int found;
  int n;
  unsigned i;
  unsigned cpu;
  unsigned maxcpu;
  unsigned size;
  unsigned long long skip;
  size_t len;
  struct cpu (*cpus)[8192];  /* Kernel maximum. */
  struct cpu* c;
  struct cpu t;
  char (*model)[64];
  unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
  /* Assumption: even big.LITTLE systems will have only a handful
   * of different CPU models. Most systems will just have one.
   */
  char models[8][64];
  char buf[1024];

  memset(bitmap, 0, sizeof(bitmap));
  memset(models, 0, sizeof(models));
  /* Slot 0 is the default for CPUs whose model cannot be determined. */
  snprintf(*models, sizeof(*models), "unknown");
  maxcpu = 0;

  cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
  if (cpus == NULL)
    return UV_ENOMEM;

  fp = uv__open_file("/proc/stat");
  if (fp == NULL) {
    uv__free(cpus);
    return UV__ERR(errno);
  }

  /* Skip the first line (the aggregate over all CPUs). */
  if (NULL == fgets(buf, sizeof(buf), fp))
    abort();

  for (;;) {
    memset(&t, 0, sizeof(t));

    n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
               &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);

    if (n != 7)
      break;

    /* Discard the remainder of the line. */
    if (NULL == fgets(buf, sizeof(buf), fp))
      abort();

    if (cpu >= ARRAY_SIZE(*cpus))
      continue;

    (*cpus)[cpu] = t;

    /* Remember which CPU numbers were actually present. */
    bitmap[cpu >> 3] |= 1 << (cpu & 7);

    if (cpu >= maxcpu)
      maxcpu = cpu + 1;
  }

  fclose(fp);

  fp = uv__open_file("/proc/cpuinfo");
  if (fp == NULL)
    goto nocpuinfo;

  for (;;) {
    if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
      break;  /* Parse error. */

    while (fgets(buf, sizeof(buf), fp)) {
      if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
        p = buf + sizeof(model_marker) - 1;
        goto parts;
      }
      if (!*model_marker2)
        continue;
      if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
        p = buf + sizeof(model_marker2) - 1;
        goto parts;
      }
    }

    goto next;  /* Not found. */

parts:
    n = (int) strcspn(p, "\n");

    /* arm64: translate CPU part code to model name. */
    if (*parts) {
      p = memmem(parts, sizeof(parts) - 1, p, n + 1);
      if (p == NULL)
        p = "unknown";
      else
        p += n + 1;
      n = (int) strcspn(p, "\n");
    }

    /* Names are stored truncated to the slot size, so compare against the
     * truncated length.
     */
    len = (size_t) n;
    if (len > sizeof(*model) - 1)
      len = sizeof(*model) - 1;

    /* Find the slot that already holds exactly this name or, failing that,
     * the first unused slot. `model` must keep pointing at the slot that
     * was found, so leave the loop with break rather than through the
     * increment.
     */
    found = 0;
    for (model = models; model < ARRAY_END(models); model++) {
      if (**model == '\0' ||
          (strlen(*model) == len && !memcmp(*model, p, len))) {
        found = 1;
        break;
      }
    }

    if (!found)
      goto next;  /* Table full; this CPU keeps slot 0 ("unknown"). */

    if (**model == '\0')
      snprintf(*model, sizeof(*model), "%.*s", n, p);

    if (cpu < maxcpu)
      (*cpus)[cpu].model = model - models;

next:
    /* Skip to the blank line that ends this processor's section. */
    while (fgets(buf, sizeof(buf), fp))
      if (*buf == '\n')
        break;
  }

  fclose(fp);
  fp = NULL;

nocpuinfo:

  n = 0;
  for (cpu = 0; cpu < maxcpu; cpu++) {
    if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
      continue;

    n++;
    snprintf(buf, sizeof(buf),
             "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);

    fp = uv__open_file(buf);
    if (fp == NULL)
      continue;

    if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
      abort();
    fclose(fp);
    fp = NULL;
  }

  /* One allocation: n result entries followed by a copy of the model table. */
  size = n * sizeof(**ci) + sizeof(models);
  *ci = uv__malloc(size);
  *count = 0;

  if (*ci == NULL) {
    uv__free(cpus);
    return UV_ENOMEM;
  }

  *count = n;
  p = memcpy(*ci + n, models, sizeof(models));

  i = 0;
  for (cpu = 0; cpu < maxcpu; cpu++) {
    if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
      continue;

    c = *cpus + cpu;

    (*ci)[i++] = (uv_cpu_info_t) {
      .model     = p + c->model * sizeof(*model),
      .speed     = c->freq / 1000,
      /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
       * therefore the multiplier is always 1000/100 = 10.
       */
      .cpu_times = (struct uv_cpu_times_s) {
        .user = 10 * c->user,
        .nice = 10 * c->nice,
        .sys  = 10 * c->sys,
        .idle = 10 * c->idle,
        .irq  = 10 * c->irq,
      },
    };
  }

  uv__free(cpus);

  return 0;
}
1925
1926
1927
0
/* Decide whether a getifaddrs() entry should be skipped.
 *
 * Entries that are not both up and running, or that carry no address, are
 * always excluded (returns 1). For the rest, a PF_PACKET entry yields
 * `exclude_type` and any other address family yields `!exclude_type`.
 */
static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
  int is_up;
  int is_running;

  is_up = (ent->ifa_flags & IFF_UP) != 0;
  is_running = (ent->ifa_flags & IFF_RUNNING) != 0;

  if (!is_up || !is_running || ent->ifa_addr == NULL)
    return 1;

  /*
   * On Linux getifaddrs returns information related to the raw underlying
   * devices. We're not interested in this information yet.
   */
  return ent->ifa_addr->sa_family == PF_PACKET ? exclude_type : !exclude_type;
}
1940
1941
/* TODO(bnoordhuis) share with bsd-ifaddrs.c */
1942
0
/* Enumerate the addresses of all interfaces that are up and running.
 *
 * On success returns 0; *addresses then points to one uv__calloc()
 * allocation holding *count entries followed by the interface name strings
 * the entries point into. When there is nothing to report *addresses is
 * NULL and *count is 0. On failure returns UV__ERR(errno) from getifaddrs()
 * or UV_ENOMEM.
 */
int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
  uv_interface_address_t* address;
  struct sockaddr_ll* sll;
  struct ifaddrs* addrs;
  struct ifaddrs* ent;
  size_t namelen;
  char* name;
  int i;

  *count = 0;
  *addresses = NULL;

  if (getifaddrs(&addrs))
    return UV__ERR(errno);

  /* Count the number of interfaces and the space their names need. */
  namelen = 0;
  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
      continue;

    namelen += strlen(ent->ifa_name) + 1;
    (*count)++;
  }

  if (*count == 0) {
    freeifaddrs(addrs);
    return 0;
  }

  /* Make sure the memory is initialized to zero using calloc() */
  *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen);
  if (*addresses == NULL) {
    freeifaddrs(addrs);
    return UV_ENOMEM;
  }

  /* The name strings live directly behind the array of entries. */
  name = (char*) &(*addresses)[*count];
  address = *addresses;

  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
      continue;

    namelen = strlen(ent->ifa_name) + 1;
    address->name = memcpy(name, ent->ifa_name, namelen);
    name += namelen;

    if (ent->ifa_addr->sa_family == AF_INET6) {
      address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
    } else {
      address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
    }

    /* getifaddrs() may leave ifa_netmask NULL. The entry was zeroed by
     * calloc(), so an absent netmask simply stays all zeroes.
     */
    if (ent->ifa_netmask != NULL) {
      if (ent->ifa_netmask->sa_family == AF_INET6) {
        address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
      } else {
        address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
      }
    }

    address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);

    address++;
  }

  /* Fill in physical addresses for each interface */
  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
      continue;

    namelen = strlen(ent->ifa_name);
    address = *addresses;

    for (i = 0; i < (*count); i++, address++) {
      /* Alias interface share the same physical address: "eth0:1" takes
       * the hardware address of "eth0".
       */
      if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
          (address->name[namelen] == 0 || address->name[namelen] == ':')) {
        sll = (struct sockaddr_ll*)ent->ifa_addr;
        memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
      }
    }
  }

  freeifaddrs(addrs);

  return 0;
}
2030
2031
2032
0
/* Hands `title` to prctl(PR_SET_NAME) when the platform headers define that
 * option; compiles to an empty function otherwise.
 */
void uv__set_process_title(const char* title) {
#ifdef PR_SET_NAME
  /* Only copies first 16 characters. */
  prctl(PR_SET_NAME, title);
#endif
}
2037
2038
2039
0
/* Searches /proc/meminfo for the label `what` (for example "MemTotal:") and
 * returns the number that follows it multiplied by 1024, i.e. the kB figure
 * expressed in bytes.
 *
 * Returns 0 when the file cannot be read, when the label is not present, or
 * when no number can be parsed after the label.
 */
static uint64_t uv__read_proc_meminfo(const char* what) {
  char contents[4096];  /* Large enough to hold all of /proc/meminfo. */
  const char* field;
  uint64_t kilobytes;

  if (uv__slurp("/proc/meminfo", contents, sizeof(contents)) != 0)
    return 0;

  field = strstr(contents, what);
  if (field == NULL)
    return 0;

  /* Pre-set to 0 so that a failed parse yields a zero result. */
  kilobytes = 0;
  sscanf(field + strlen(what), "%" PRIu64 " kB", &kilobytes);

  return kilobytes * 1024;
}
2059
2060
2061
0
/* Returns the "MemAvailable:" figure from /proc/meminfo in bytes.  When that
 * lookup yields 0, falls back to sysinfo()'s freeram scaled by mem_unit.
 * Returns 0 if both sources fail.
 */
uint64_t uv_get_free_memory(void) {
  struct sysinfo info;
  uint64_t available;

  available = uv__read_proc_meminfo("MemAvailable:");

  /* sysinfo() is only consulted when /proc/meminfo gave us nothing. */
  if (available == 0 && sysinfo(&info) == 0)
    available = (uint64_t) info.freeram * info.mem_unit;

  return available;
}
2075
2076
2077
0
/* Returns the "MemTotal:" figure from /proc/meminfo in bytes.  When that
 * lookup yields 0, falls back to sysinfo()'s totalram scaled by mem_unit.
 * Returns 0 if both sources fail.
 */
uint64_t uv_get_total_memory(void) {
  struct sysinfo info;
  uint64_t total;

  total = uv__read_proc_meminfo("MemTotal:");

  /* sysinfo() is only consulted when /proc/meminfo gave us nothing. */
  if (total == 0 && sysinfo(&info) == 0)
    total = (uint64_t) info.totalram * info.mem_unit;

  return total;
}
2091
2092
2093
0
/* Reads `filename` and parses its contents as an unsigned 64-bit decimal.
 *
 * Returns the parsed value, UINT64_MAX when the file holds exactly "max\n",
 * and 0 when the file cannot be read or holds anything else.
 */
static uint64_t uv__read_uint64(const char* filename) {
  char text[32];  /* Large enough to hold an encoded uint64_t. */
  uint64_t value;

  value = 0;

  if (uv__slurp(filename, text, sizeof(text)) != 0)
    return value;

  if (sscanf(text, "%" PRIu64, &value) == 1)
    return value;

  /* Not a number: the only other spelling we understand is "max". */
  if (strcmp(text, "max\n") == 0)
    value = UINT64_MAX;

  return value;
}
2105
2106
2107
/* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
 * finds the location and length of the memory controller mount path.
 * This disregards the leading / for easy concatenation of paths.
 *
 * Only the first ':' of each line is examined.  A line matches when the text
 * at that colon starts with ":memory:/".
 *
 * On success returns a pointer into `buf` just past that prefix and stores
 * the number of bytes up to (not including) the next newline in `*n`.
 * Returns NULL, leaving `*n` untouched, if no such line exists.
 */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  /* The whole prefix, slash included, is matched before it is skipped.
   * Matching only ":memory:" and then skipping one byte more would step over
   * the terminating NUL of a line that is cut off right after the colon.
   */
  static const char prefix[] = ":memory:/";
  const size_t prefix_len = sizeof(prefix) - 1;
  char* p;

  /* Seek to the memory controller line. */
  p = strchr(buf, ':');
  while (p != NULL && strncmp(p, prefix, prefix_len) != 0) {
    p = strchr(p, '\n');
    if (p != NULL)
      p = strchr(p, ':');
  }

  if (p != NULL) {
    /* Determine the length of the mount path. */
    p += prefix_len;
    *n = (int) strcspn(p, "\n");
  }

  return p;
}
2131
2132
/* Fills `*high` and `*max` with the cgroup1 soft limit
 * (memory.soft_limit_in_bytes) and hard limit (memory.limit_in_bytes).
 *
 * `buf` holds the contents of /proc/self/cgroup.  The limits of the memory
 * controller named there are tried first; if either of them reads as 0 the
 * files directly under /sys/fs/cgroup/memory are used instead.  A value equal
 * to cgroup1's "unlimited" marker is reported as UINT64_MAX.
 */
static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* mount;
  int mount_len;
  int have_limits;
  uint64_t unlimited;

  have_limits = 0;

  /* Find out where the controller is mounted. */
  mount = uv__cgroup1_find_memory_controller(buf, &mount_len);
  if (mount != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes",
             mount_len, mount);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes",
             mount_len, mount);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    have_limits = (*high != 0 && *max != 0);
  }

  if (!have_limits) {
    /* Fall back to the limits of the global memory controller. */
    *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
    *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
  }

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
  unlimited = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == unlimited)
    *high = UINT64_MAX;
  if (*max == unlimited)
    *max = UINT64_MAX;
}
2171
2172
/* Fills `*max` and `*high` from the memory.max and memory.high files of the
 * cgroup2 group named in `buf`.
 *
 * `buf` holds the contents of /proc/self/cgroup and is taken to start with
 * "0::/"; the group path is whatever follows that prefix up to the first
 * newline.
 */
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* group;
  int group_len;

  /* Find out where the controller is mounted. */
  group = buf + strlen("0::/");
  group_len = (int) strcspn(group, "\n");

  /* Read the memory limits of the controller. */
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max",
           group_len, group);
  *max = uv__read_uint64(filename);

  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high",
           group_len, group);
  *high = uv__read_uint64(filename);
}
2188
2189
0
/* Returns the smaller of the cgroup "high" and "max" memory limits for the
 * /proc/self/cgroup contents in `buf`, or 0 when either limit reads as 0.
 */
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;
  uint64_t lowest;

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4) == 0)
    uv__get_cgroup2_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup1_memory_limits(buf, &high, &max);

  if (high == 0 || max == 0)
    return 0;

  lowest = max;
  if (high < max)
    lowest = high;

  return lowest;
}
2204
2205
0
/* Returns the cgroup memory limit that applies to this process, as computed
 * by uv__get_cgroup_constrained_memory() from /proc/self/cgroup.  Returns 0
 * when that file cannot be read.
 */
uint64_t uv_get_constrained_memory(void) {
  char cgroup[1024];

  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) != 0)
    return 0;

  return uv__get_cgroup_constrained_memory(cgroup);
}
2213
2214
2215
0
/* Returns memory.usage_in_bytes for the cgroup1 memory controller named in
 * `buf` (the contents of /proc/self/cgroup).  When no controller is listed,
 * or its usage file reads as 0, the file directly under
 * /sys/fs/cgroup/memory is used instead.
 */
static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t usage;
  char* mount;
  int mount_len;

  /* Find out where the controller is mounted. */
  mount = uv__cgroup1_find_memory_controller(buf, &mount_len);

  if (mount != NULL) {
    snprintf(filename, sizeof(filename),
            "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes",
            mount_len, mount);
    usage = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the read above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (usage != 0)
      return usage;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}
2238
2239
0
/* Returns the value of memory.current for the cgroup2 group named in `buf`.
 *
 * `buf` holds the contents of /proc/self/cgroup and is taken to start with
 * "0::/"; the group path is whatever follows that prefix up to the first
 * newline.
 */
static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  char* group;
  int group_len;

  /* Find out where the controller is mounted. */
  group = buf + strlen("0::/");
  group_len = (int) strcspn(group, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", group_len, group);

  return uv__read_uint64(filename);
}
2252
2253
0
/* Returns how much memory is still available to this process.
 *
 * When a cgroup limit is in effect and does not exceed total system memory,
 * the result is that limit minus the cgroup's current usage (0 if usage is
 * above the limit).  Without such a limit the result is
 * uv_get_free_memory().  Returns 0 when /proc/self/cgroup cannot be read.
 */
uint64_t uv_get_available_memory(void) {
  char cgroup[1024];
  uint64_t limit;
  uint64_t usage;

  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) != 0)
    return 0;

  limit = uv__get_cgroup_constrained_memory(cgroup);

  /* No limit, or a limit larger than the machine: report system-wide free
   * memory.  Total memory is only queried when a limit exists.
   */
  if (limit == 0 || limit > uv_get_total_memory())
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(cgroup, "0::/", 4) == 0)
    usage = uv__get_cgroup2_current_memory(cgroup);
  else
    usage = uv__get_cgroup1_current_memory(cgroup);

  /* memory usage can be higher than the limit (for short bursts of time) */
  return limit < usage ? 0 : limit - usage;
}
2282
2283
2284
static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2285
0
                                            long long* quota) {
2286
0
  static const char cgroup_mount[] = "/sys/fs/cgroup";
2287
0
  const char* cgroup_trimmed;
2288
0
  char buf[1024];
2289
0
  char path[256];
2290
0
  char full_path[sizeof(path) + sizeof("/cpu.max")];
2291
0
  char quota_buf[16];
2292
0
  char* last_slash;
2293
0
  int cgroup_size;
2294
0
  long long limit;
2295
0
  long long min_quota;
2296
0
  long long period;
2297
2298
0
  if (strncmp(cgroup, "0::/", 4) != 0)
2299
0
    return UV_EINVAL;
2300
2301
  /* Trim ending \n by replacing it with a 0 */
2302
0
  cgroup_trimmed = cgroup + sizeof("0::/") - 1;      /* Skip the prefix "0::/" */
2303
0
  cgroup_size = (int)strcspn(cgroup_trimmed, "\n");  /* Find the first \n */
2304
0
  min_quota = LLONG_MAX;
2305
2306
  /* Construct the path to the cpu.max files */
2307
0
  snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount,
2308
0
           cgroup_size, cgroup_trimmed);
2309
2310
  /* Read controllers, if not exists, not really a cgroup */
2311
0
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
2312
0
    return UV_EIO;
2313
2314
0
  snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size,
2315
0
           cgroup_trimmed);
2316
2317
  /*
2318
   * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path.
2319
   * At each level, attempt to read the "cpu.max" file, which defines the CPU
2320
   * quota and period.
2321
   *
2322
   * This reflects how Linux applies cgroup limits hierarchically.
2323
   *
2324
   * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check:
2325
   *   - /sys/fs/cgroup/foo/bar/baz/cpu.max
2326
   *   - /sys/fs/cgroup/foo/bar/cpu.max
2327
   *   - /sys/fs/cgroup/foo/cpu.max
2328
   *   - /sys/fs/cgroup/cpu.max
2329
   */
2330
0
  while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) {
2331
0
    snprintf(full_path, sizeof(full_path), "%s/cpu.max", path);
2332
2333
    /* Silently ignore and continue if the file does not exist */
2334
0
    if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0)
2335
0
      goto next;
2336
2337
    /* No limit, move on */
2338
0
    if (strncmp(quota_buf, "max", 3) == 0)
2339
0
      goto next;
2340
2341
    /* Read cpu.max */
2342
0
    if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2)
2343
0
      goto next;
2344
2345
    /* Can't divide by 0 */
2346
0
    if (period == 0)
2347
0
      goto next;
2348
2349
0
    *quota = limit / period;
2350
0
    if (*quota == 0)
2351
0
        *quota = 1;
2352
0
    if (*quota < min_quota)
2353
0
      min_quota = *quota;
2354
2355
0
next:
2356
    /* Move up one level in the cgroup hierarchy by trimming the last path.
2357
     * The loop ends once we reach the cgroup root mount point.
2358
     */
2359
0
    last_slash = strrchr(path, '/');
2360
0
    if (last_slash == NULL || strcmp(path, cgroup_mount) == 0)
2361
0
      break;
2362
0
    *last_slash = '\0';
2363
0
  }
2364
2365
0
  return 0;
2366
0
}
2367
2368
/* Finds the first occurrence of ":cpu," in `cgroup`.
 *
 * On a match, returns a pointer to the byte right after that marker and
 * stores the number of bytes from there up to (not including) the next
 * newline in `*cgroup_size`.  Returns NULL, leaving `*cgroup_size` untouched,
 * when the marker is absent.
 */
static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  static const char marker[] = ":cpu,";
  char* match;

  /* Seek to the cpu controller line. */
  match = strstr(cgroup, marker);
  if (match == NULL)
    return NULL;

  /* Step over the marker, then measure what is left of the line. */
  match += sizeof(marker) - 1;
  *cgroup_size = (int) strcspn(match, "\n");

  return match;
}
2382
2383
static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2384
0
                                            long long* quota) {
2385
0
  char path[256];
2386
0
  char buf[1024];
2387
0
  int cgroup_size;
2388
0
  char* cgroup_cpu;
2389
0
  long long period_length;
2390
0
  long long quota_per_period;
2391
2392
0
  cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2393
2394
0
  if (cgroup_cpu == NULL)
2395
0
    return UV_EIO;
2396
2397
  /* Construct the path to the cpu.cfs_quota_us file */
2398
0
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2399
0
           cgroup_size, cgroup_cpu);
2400
2401
  /* Read cpu.cfs_quota_us */
2402
0
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
2403
0
    return UV_EIO;
2404
2405
0
  if (sscanf(buf, "%lld", &quota_per_period) != 1)
2406
0
    return UV_EINVAL;
2407
2408
  /* Construct the path to the cpu.cfs_period_us file */
2409
0
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2410
0
           cgroup_size, cgroup_cpu);
2411
2412
  /* Read cpu.cfs_period_us */
2413
0
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
2414
0
    return UV_EIO;
2415
2416
0
  if (sscanf(buf, "%lld", &period_length) != 1)
2417
0
    return UV_EINVAL;
2418
2419
  /* Can't divide by 0 */
2420
0
  if (period_length == 0)
2421
0
    return UV_EINVAL;
2422
2423
0
  *quota = quota_per_period / period_length;
2424
2425
0
  return 0;
2426
0
}
2427
2428
0
int uv__get_constrained_cpu(long long* quota) {
2429
0
  char cgroup[1024];
2430
2431
  /* Read the cgroup from /proc/self/cgroup */
2432
0
  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2433
0
    return UV_EIO;
2434
2435
  /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2436
   * The entry for cgroup v2 is always in the format "0::$PATH"
2437
   * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2438
0
  if (strncmp(cgroup, "0::/", 4) == 0)
2439
0
    return uv__get_cgroupv2_constrained_cpu(cgroup, quota);
2440
0
  else
2441
0
    return uv__get_cgroupv1_constrained_cpu(cgroup, quota);
2442
0
}
2443
2444
2445
0
/* Stores three load averages in `avg`.
 *
 * The values are parsed from /proc/loadavg when possible.  Otherwise they are
 * taken from sysinfo(), whose `loads` entries are divided by 65536.0.  If
 * sysinfo() fails as well the function returns without writing the sysinfo
 * values.
 */
void uv_loadavg(double avg[3]) {
  struct sysinfo info;
  char text[128];  /* Large enough to hold all of /proc/loadavg. */
  int i;

  if (uv__slurp("/proc/loadavg", text, sizeof(text)) == 0 &&
      sscanf(text, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]) == 3) {
    return;
  }

  if (sysinfo(&info) < 0)
    return;

  for (i = 0; i < 3; i++)
    avg[i] = (double) info.loads[i] / 65536.0;
}
2460
2461
2462
static int compare_watchers(const struct watcher_list* a,
2463
0
                            const struct watcher_list* b) {
2464
0
  if (a->wd < b->wd) return -1;
2465
0
  if (a->wd > b->wd) return 1;
2466
0
  return 0;
2467
0
}
2468
2469
2470
0
/* Creates the loop's inotify descriptor on first use and starts watching it
 * for POLLIN.  Does nothing when loop->inotify_fd is already set.
 *
 * Returns 0 on success, the translated errno if inotify_init1() fails, or the
 * error reported by uv__io_init_start() (the new descriptor is closed again
 * in that case).
 */
static int init_inotify(uv_loop_t* loop) {
  int fd;
  int rc;

  if (loop->inotify_fd != -1)
    return 0;

  fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
  if (fd < 0)
    return UV__ERR(errno);

  rc = uv__io_init_start(loop, &loop->inotify_read_watcher, UV__INOTIFY_READ,
                         fd, POLLIN);
  if (rc == 0) {
    loop->inotify_fd = fd;
    return 0;
  }

  uv__close(fd);
  return rc;
}
2491
2492
2493
0
/* Re-arms every fs-event handle recorded in the watcher tree `root`.
 *
 * Each handle is stopped (which releases its old watcher list) while a private
 * copy of its path is kept, and is then started again with that path through
 * uv_fs_event_start().
 *
 * Returns 0 on success or when `root` is NULL.  If restarting a handle fails,
 * that error is returned; handles that were not restarted yet are left
 * stopped and the path copies made for them are released.
 */
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      /* Park the stopped handle, together with its path copy, until every
       * old watcher list has been processed.
       */
      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    if (err) {
      /* The handles still parked in `queue` own path copies that nothing
       * else will free, and `queue` itself lives on this stack frame.
       * Unlink them and release the copies before bailing out.
       */
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        uv__queue_remove(q);
        handle = uv__queue_data(q, uv_fs_event_t, watchers);
        uv__free(handle->path);
        handle->path = NULL;
      }
      return err;
    }
  }

  return 0;
}
2557
2558
2559
0
/* Looks up the watcher list whose `wd` equals the given watch descriptor in
 * the loop's watcher tree.  Returns whatever RB_FIND yields for that key.
 */
static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
  struct watcher_list key;

  /* Only `wd` is filled in; it is the field compare_watchers() looks at. */
  key.wd = wd;

  return RB_FIND(watcher_root, uv__inotify_watchers(loop), &key);
}
2564
2565
2566
0
/* Tears down watcher list `w` if it is safe to do so: removes it from the
 * loop's tree, removes the inotify watch and frees the list.  Nothing happens
 * while the list is flagged as being iterated or still has handles queued.
 */
static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
  /* if the watcher_list->watchers is being iterated over, we can't free it. */
  if (w->iterating)
    return;

  if (!uv__queue_empty(&w->watchers))
    return;

  /* No watchers left for this path. Clean up. */
  RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
  inotify_rm_watch(loop->inotify_fd, w->wd);
  uv__free(w);
}
2575
2576
2577
0
/* Drains the loop's inotify descriptor and dispatches every event to the
 * callbacks of the handles registered for its watch descriptor.
 *
 * Reading continues until read() reports EAGAIN/EWOULDBLOCK.  The incoming
 * value of `events` is ignored; the variable is reused to build the
 * UV_CHANGE/UV_RENAME mask handed to each callback.  `dummy` is unused.
 */
void uv__inotify_read(uv_loop_t* loop, uv__io_t* dummy, unsigned int events) {
  const struct inotify_event* ev;
  struct watcher_list* list;
  uv_fs_event_t* handle;
  struct uv__queue pending;
  struct uv__queue* node;
  const char* name;
  const char* cursor;
  const char* end;
  ssize_t nread;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    /* Retry reads that were interrupted by a signal. */
    do
      nread = read(loop->inotify_fd, buf, sizeof(buf));
    while (nread == -1 && errno == EINTR);

    if (nread == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(nread > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs, each followed by
     * `len` bytes of name.
     */
    end = buf + nread;
    for (cursor = buf; cursor < end; cursor += sizeof(*ev) + ev->len) {
      ev = (const struct inotify_event*) cursor;

      /* Attribute/content changes map to UV_CHANGE, any other bit in the
       * mask maps to UV_RENAME.
       */
      events = 0;
      if (ev->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (ev->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      list = find_watcher(loop, ev->wd);
      if (list == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      if (ev->len)
        name = (const char*) (ev + 1);
      else
        name = uv__basename_r(list->path);

      /* A callback may call uv_fs_event_stop() and change the queue while we
       * walk it.  To stay safe the handles are first moved to a private
       * queue and put back one by one right before their callback runs, and
       * the `iterating` flag keeps uv_fs_event_stop() from freeing the
       * watcher list in the meantime.
       */
      list->iterating = 1;
      uv__queue_move(&list->watchers, &pending);
      while (!uv__queue_empty(&pending)) {
        node = uv__queue_head(&pending);
        handle = uv__queue_data(node, uv_fs_event_t, watchers);

        uv__queue_remove(node);
        uv__queue_insert_tail(&list->watchers, node);

        handle->cb(handle, name, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      list->iterating = 0;
      maybe_free_watcher_list(list, loop);
    }
  }
}
2649
2650
2651
0
/* Initializes `handle` as a UV_FS_EVENT handle belonging to `loop`.
 * Always returns 0.
 */
int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*) handle, UV_FS_EVENT);

  return 0;
}
2655
2656
2657
int uv_fs_event_start(uv_fs_event_t* handle,
2658
                      uv_fs_event_cb cb,
2659
                      const char* path,
2660
0
                      unsigned int flags) {
2661
0
  struct watcher_list* w;
2662
0
  uv_loop_t* loop;
2663
0
  size_t len;
2664
0
  int events;
2665
0
  int err;
2666
0
  int wd;
2667
2668
0
  if (uv__is_active(handle))
2669
0
    return UV_EINVAL;
2670
2671
0
  loop = handle->loop;
2672
2673
0
  err = init_inotify(loop);
2674
0
  if (err)
2675
0
    return err;
2676
2677
0
  events = IN_ATTRIB
2678
0
         | IN_CREATE
2679
0
         | IN_MODIFY
2680
0
         | IN_DELETE
2681
0
         | IN_DELETE_SELF
2682
0
         | IN_MOVE_SELF
2683
0
         | IN_MOVED_FROM
2684
0
         | IN_MOVED_TO;
2685
2686
0
  wd = inotify_add_watch(loop->inotify_fd, path, events);
2687
0
  if (wd == -1)
2688
0
    return UV__ERR(errno);
2689
2690
0
  w = find_watcher(loop, wd);
2691
0
  if (w)
2692
0
    goto no_insert;
2693
2694
0
  len = strlen(path) + 1;
2695
0
  w = uv__malloc(sizeof(*w) + len);
2696
0
  if (w == NULL)
2697
0
    return UV_ENOMEM;
2698
2699
0
  w->wd = wd;
2700
0
  w->path = memcpy(w + 1, path, len);
2701
0
  uv__queue_init(&w->watchers);
2702
0
  w->iterating = 0;
2703
0
  RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2704
2705
0
no_insert:
2706
0
  uv__handle_start(handle);
2707
0
  uv__queue_insert_tail(&w->watchers, &handle->watchers);
2708
0
  handle->path = w->path;
2709
0
  handle->cb = cb;
2710
0
  handle->wd = wd;
2711
2712
0
  return 0;
2713
0
}
2714
2715
2716
0
/* Stops `handle`: clears its watch descriptor and path, marks it inactive,
 * unlinks it from its watcher list and lets maybe_free_watcher_list() release
 * the list if it became unused.  A handle that is not active is left alone.
 * Always returns 0.
 */
int uv_fs_event_stop(uv_fs_event_t* handle) {
  struct watcher_list* list;

  if (!uv__is_active(handle))
    return 0;

  /* Look the list up before handle->wd is reset below. */
  list = find_watcher(handle->loop, handle->wd);
  assert(list != NULL);

  handle->wd = -1;
  handle->path = NULL;
  uv__handle_stop(handle);
  uv__queue_remove(&handle->watchers);

  maybe_free_watcher_list(list, handle->loop);

  return 0;
}
2734
2735
2736
0
/* Close hook for fs-event handles: simply stops the handle and discards the
 * result of uv_fs_event_stop().
 */
void uv__fs_event_close(uv_fs_event_t* handle) {
  (void) uv_fs_event_stop(handle);
}