/src/CMake/Utilities/cmlibuv/src/unix/linux.c
Line | Count | Source |
1 | | /* Copyright Joyent, Inc. and other Node contributors. All rights reserved. |
2 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
3 | | * of this software and associated documentation files (the "Software"), to |
4 | | * deal in the Software without restriction, including without limitation the |
5 | | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
6 | | * sell copies of the Software, and to permit persons to whom the Software is |
7 | | * furnished to do so, subject to the following conditions: |
8 | | * |
9 | | * The above copyright notice and this permission notice shall be included in |
10 | | * all copies or substantial portions of the Software. |
11 | | * |
12 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
13 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
14 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
15 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
16 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
17 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
18 | | * IN THE SOFTWARE. |
19 | | */ |
20 | | |
21 | | /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their |
22 | | * EPOLL* counterparts. We use the POLL* variants in this file because that |
23 | | * is what libuv uses elsewhere. |
24 | | */ |
25 | | |
26 | | #include "uv.h" |
27 | | #include "internal.h" |
28 | | |
29 | | #include <inttypes.h> |
30 | | #include <stdatomic.h> |
31 | | #include <stddef.h> /* offsetof */ |
32 | | #include <stdint.h> |
33 | | #include <stdio.h> |
34 | | #include <stdlib.h> |
35 | | #include <string.h> |
36 | | #include <assert.h> |
37 | | #include <errno.h> |
38 | | |
39 | | #include <fcntl.h> |
40 | | #include <ifaddrs.h> |
41 | | #include <net/ethernet.h> |
42 | | #include <net/if.h> |
43 | | #include <netpacket/packet.h> |
44 | | #include <sys/epoll.h> |
45 | | #include <sys/inotify.h> |
46 | | #include <sys/mman.h> |
47 | | #include <sys/param.h> |
48 | | #include <sys/prctl.h> |
49 | | #include <sys/socket.h> |
50 | | #include <sys/stat.h> |
51 | | #include <sys/syscall.h> |
52 | | #include <sys/sysinfo.h> |
53 | | #include <sys/sysmacros.h> |
54 | | #include <sys/types.h> |
55 | | #include <sys/utsname.h> |
56 | | #include <time.h> |
57 | | #include <unistd.h> |
58 | | |
/* Fallback system call numbers for toolchains whose headers do not provide
 * them. Every definition is guarded by #ifndef so a header-provided value
 * always wins. Architectures that are not listed get no fallback; the
 * wrappers further down test for the macro and report ENOSYS in that case.
 */
#ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
#endif

#ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
#endif

#ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
#endif

/* PowerPC note: gcc on Linux predefines __powerpc__ but not __ppc__, clang
 * predefines both. Accept either spelling in all three tables below so the
 * fallback is not silently skipped depending on the compiler.
 */
#ifndef __NR_copy_file_range
# if defined(__x86_64__)
#  define __NR_copy_file_range 326
# elif defined(__i386__)
#  define __NR_copy_file_range 377
# elif defined(__s390__)
#  define __NR_copy_file_range 375
# elif defined(__arm__)
#  define __NR_copy_file_range 391
# elif defined(__aarch64__)
#  define __NR_copy_file_range 285
# elif defined(__powerpc__) || defined(__ppc__)
#  define __NR_copy_file_range 379
# elif defined(__arc__)
#  define __NR_copy_file_range 285
# elif defined(__riscv)
#  define __NR_copy_file_range 285
# endif
#endif /* __NR_copy_file_range */

#ifndef __NR_statx
# if defined(__x86_64__)
#  define __NR_statx 332
# elif defined(__i386__)
#  define __NR_statx 383
# elif defined(__aarch64__)
#  define __NR_statx 397
# elif defined(__arm__)
#  define __NR_statx 397
# elif defined(__powerpc__) || defined(__ppc__)
#  define __NR_statx 383
# elif defined(__s390__)
#  define __NR_statx 379
# elif defined(__riscv)
#  define __NR_statx 291
# endif
#endif /* __NR_statx */

#ifndef __NR_getrandom
# if defined(__x86_64__)
#  define __NR_getrandom 318
# elif defined(__i386__)
#  define __NR_getrandom 355
# elif defined(__aarch64__)
#  define __NR_getrandom 384
# elif defined(__arm__)
#  define __NR_getrandom 384
# elif defined(__powerpc__) || defined(__ppc__)
#  define __NR_getrandom 359
# elif defined(__s390__)
#  define __NR_getrandom 349
# elif defined(__riscv)
#  define __NR_getrandom 278
# endif
#endif /* __NR_getrandom */
126 | | |
/* Local copies of the io_uring constants this file uses, so that it does not
 * depend on kernel headers being recent enough to provide them.
 */

/* Bits for uv__io_uring_params.flags. uv__iou_init() only requests
 * NO_SQARRAY when the running kernel reports itself as 6.6 or newer.
 */
enum {
  UV__IORING_SETUP_SQPOLL = 2u,
  UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
};

/* Bits reported back in uv__io_uring_params.features. uv__iou_init() gives
 * up on the ring unless all three are present.
 */
enum {
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
  UV__IORING_FEAT_NODROP = 2u,
  UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
};

/* Values for uv__io_uring_sqe.opcode. */
enum {
  UV__IORING_OP_READV = 1,
  UV__IORING_OP_WRITEV = 2,
  UV__IORING_OP_FSYNC = 3,
  UV__IORING_OP_OPENAT = 18,
  UV__IORING_OP_CLOSE = 19,
  UV__IORING_OP_STATX = 21,
  UV__IORING_OP_EPOLL_CTL = 29,
  UV__IORING_OP_RENAMEAT = 35,
  UV__IORING_OP_UNLINKAT = 36,
  UV__IORING_OP_MKDIRAT = 37,
  UV__IORING_OP_SYMLINKAT = 38,
  UV__IORING_OP_LINKAT = 39,
  UV__IORING_OP_FTRUNCATE = 55,
};

/* Bits for the `flags` argument of uv__io_uring_enter(). */
enum {
  UV__IORING_ENTER_GETEVENTS = 1u,
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
};

/* Bits found in the submission ring's flags word (uv__iou.sqflags).
 * uv__iou_submit() checks NEED_WAKEUP after publishing a new tail.
 */
enum {
  UV__IORING_SQ_NEED_WAKEUP = 1u,
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
};
163 | | |
/* Byte offsets of the completion ring's fields, filled in by
 * io_uring_setup() as part of struct uv__io_uring_params. uv__iou_init()
 * adds each offset to the base address of the ring mapping to locate the
 * corresponding field. The layout is kernel ABI; its size is pinned below.
 */
struct uv__io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;
  uint64_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
176 | | |
/* Byte offsets of the submission ring's fields, filled in by
 * io_uring_setup() as part of struct uv__io_uring_params. uv__iou_init()
 * adds each offset to the base address of the ring mapping to locate the
 * corresponding field. The layout is kernel ABI; its size is pinned below.
 */
struct uv__io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;
  uint32_t dropped;
  uint32_t array;  /* Start of the slot -> sqe index array. */
  uint32_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
190 | | |
/* One completion queue entry. uv__iou_init() uses its size to compute the
 * length of the completion ring. The size is pinned to 16 bytes below.
 * NOTE(review): user_data presumably echoes the user_data of the matching
 * submission entry; the code that consumes completions is not in this part
 * of the file, confirm there.
 */
struct uv__io_uring_cqe {
  uint64_t user_data;
  int32_t res;
  uint32_t flags;
};

STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
198 | | |
/* One submission queue entry. uv__iou_get_sqe() hands out a zeroed entry
 * with user_data set to the request pointer; the uv__iou_fs_*() functions
 * then fill in the opcode and its operands. The anonymous unions give
 * operation-specific names to the same storage. The layout is kernel ABI,
 * hence the size and offset assertions that follow.
 */
struct uv__io_uring_sqe {
  uint8_t opcode;  /* One of the UV__IORING_OP_* values. */
  uint8_t flags;
  uint16_t ioprio;
  int32_t fd;
  union {
    uint64_t off;
    uint64_t addr2;
  };
  union {
    uint64_t addr;
  };
  uint32_t len;
  union {
    uint32_t rw_flags;
    uint32_t fsync_flags;
    uint32_t open_flags;
    uint32_t statx_flags;
  };
  uint64_t user_data;
  union {
    uint16_t buf_index;
    uint64_t pad[3];
  };
};

STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
236 | | |
/* In/out argument of uv__io_uring_setup(). uv__iou_init() zeroes it, sets
 * `flags` (and `sq_thread_idle` when SQPOLL is requested), and after the
 * call reads back `features`, `sq_entries`, `cq_entries`, `sq_off` and
 * `cq_off`. The layout is kernel ABI; size and offsets are pinned below.
 */
struct uv__io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;           /* UV__IORING_SETUP_* bits. */
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;  /* Milliseconds, see uv__iou_init(). */
  uint32_t features;        /* UV__IORING_FEAT_* bits. */
  uint32_t reserved[4];
  struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
  struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
};

STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
252 | | |
/* All three epoll_ctl() opcodes must be smaller than 4, i.e. fit in two bits.
 * NOTE(review): presumably because the opcode is packed into the low bits of
 * another value by code outside this part of the file; confirm there.
 */
STATIC_ASSERT(EPOLL_CTL_ADD < 4);
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256 | | |
/* Node of the per-loop red-black tree of inotify watches (the tree functions
 * are generated by RB_GENERATE_STATIC with `entry` as the link field).
 * NOTE(review): field meanings below are inferred from their names; the code
 * that populates them is outside this part of the file.
 */
struct watcher_list {
  RB_ENTRY(watcher_list) entry;  /* Tree linkage. */
  struct uv__queue watchers;     /* Presumably the handles sharing this wd. */
  int iterating;
  char* path;
  int wd;                        /* Presumably the inotify watch descriptor. */
};

/* Tree head. It consists of a single pointer so that the address of
 * loop->inotify_watchers can be cast to it, see uv__inotify_watchers().
 */
struct watcher_root {
  struct watcher_list* rbh_root;
};
268 | | |
/* Forward declarations of helpers that are used before their definitions. */
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b);
static void maybe_free_watcher_list(struct watcher_list* w,
                                    uv_loop_t* loop);

static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]);

static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e);

/* Generate the static red-black tree functions for the watcher tree, keyed
 * through compare_watchers().
 */
RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
287 | 0 |
|
288 | 0 |
|
289 | 0 | static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) { |
290 | | /* This cast works because watcher_root is a struct with a pointer as its |
291 | | * sole member. Such type punning is unsafe in the presence of strict |
292 | | * pointer aliasing (and is just plain nasty) but that is why libuv |
293 | | * is compiled with -fno-strict-aliasing. |
294 | | */ |
295 | 0 | return (struct watcher_root*) &loop->inotify_watchers; |
296 | 0 | } |
297 | | |
298 | | |
/* Return the running kernel's version packed as
 * major * 65536 + minor * 256 + patch, or 0 when it cannot be determined.
 *
 * A successful result is cached in a relaxed atomic, so concurrent first
 * calls may each compute the value but all later calls return immediately.
 * A failure (return value 0) is not cached and is retried on the next call.
 *
 * Sources, in order of preference:
 *   1. /proc/version_signature (Ubuntu's mainline version),
 *   2. the "Debian x.y.z" marker in uname's `version` field,
 *   3. uname's `release` field.
 */
unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;
  char v_sig[256];
  char* needle;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  /* Check /proc/version_signature first as it's the way to get the mainline
   * kernel version in Ubuntu. The format is:
   *   Ubuntu ubuntu_kernel_version mainline_kernel_version
   * For example:
   *   Ubuntu 5.15.0-79.86-generic 5.15.111
   */
  if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
    if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (-1 == uname(&u))
    return 0;

  /* In Debian we need to check `version` instead of `release` to extract the
   * mainline kernel version. This is an example of what it looks like:
   *   #1 SMP Debian 5.10.46-4 (2021-08-03)
   */
  needle = strstr(u.version, "Debian ");
  if (needle != NULL)
    if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* Handle it when the process runs under the UNAME26 personality:
   *
   * - kernels >= 3.x identify as 2.6.40+x
   * - kernels >= 4.x identify as 2.6.60+x
   *
   * UNAME26 is a poorly conceived hack that doesn't let us distinguish
   * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
   * that 2.6.60+x means 4.x.
   *
   * Fun fact of the day: it's technically possible to observe the actual
   * kernel version for a brief moment because uname() first copies out the
   * real release string before overwriting it with the backcompat string.
   */
  if (major == 2 && minor == 6) {
    if (patch >= 60) {
      major = 4;
      minor = patch - 60;
      patch = 0;
    } else if (patch >= 40) {
      major = 3;
      minor = patch - 40;
      patch = 0;
    }
  }

calculate_version:
  /* Each component below `major` gets eight bits. Long-term kernels do reach
   * patch levels above 255 (for example 4.19.325); without clamping, the
   * excess would carry into the minor byte and make 4.19.325 compare like
   * 4.20.69. Saturate instead so ordered comparisons against packed
   * constants stay meaningful.
   */
  if (minor > 255)
    minor = 255;

  if (patch > 255)
    patch = 255;

  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
369 | | |
370 | | |
/* Thin wrapper around the copy_file_range system call.
 *
 * Arguments are forwarded unchanged and the raw syscall() result is returned
 * (-1 with errno set on failure). When no system call number is known at
 * compile time the function fails with ENOSYS.
 */
ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifndef __NR_copy_file_range
  errno = ENOSYS;
  return -1;
#else
  long nbytes;

  nbytes = syscall(__NR_copy_file_range,
                   fd_in, off_in,
                   fd_out, off_out,
                   len, flags);

  return nbytes;
#endif
}
391 | | |
392 | | |
393 | | int uv__statx(int dirfd, |
394 | | const char* path, |
395 | | int flags, |
396 | | unsigned int mask, |
397 | 0 | struct uv__statx* statxbuf) { |
398 | | #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30 |
399 | | return errno = ENOSYS, -1; |
400 | | #else |
401 | 0 | int rc; |
402 | |
|
403 | 0 | rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf); |
404 | 0 | if (rc >= 0) |
405 | 0 | uv__msan_unpoison(statxbuf, sizeof(*statxbuf)); |
406 | |
|
407 | 0 | return rc; |
408 | 0 | #endif |
409 | 0 | } |
410 | | |
411 | | |
/* Thin wrapper around the getrandom system call.
 *
 * Returns the raw syscall() result. On success the whole buffer is marked as
 * initialized for MemorySanitizer. Fails with ENOSYS when no system call
 * number is known at compile time or when targeting Android API < 28.
 */
ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if defined(__NR_getrandom) && \
    !(defined(__ANDROID_API__) && __ANDROID_API__ < 28)
  ssize_t nread;

  nread = syscall(__NR_getrandom, buf, buflen, flags);
  if (nread < 0)
    return nread;

  /* The kernel wrote the buffer behind the sanitizer's back. */
  uv__msan_unpoison(buf, buflen);

  return nread;
#else
  errno = ENOSYS;
  return -1;
#endif
}
425 | | |
426 | | |
/* Raw io_uring_setup system call. Returns the syscall() result: the ring
 * file descriptor on success, -1 with errno set on failure.
 */
int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  long ringfd;

  ringfd = syscall(__NR_io_uring_setup, entries, params);

  return ringfd;
}
430 | | |
431 | | |
/* Raw io_uring_enter system call. Returns the syscall() result, -1 with
 * errno set on failure.
 */
int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  long rc;

  /* The two trailing arguments once carried a sigset_t and its size. Newer
   * kernels ignore them unless IORING_ENTER_EXT_ARG is set, in which case
   * they describe a struct io_uring_getevents_arg. Neither is used here, so
   * pass a null pointer and a zero size.
   */
  rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags,
               NULL, 0L);

  return rc;
}
448 | | |
449 | | |
/* Raw io_uring_register system call. Returns the syscall() result, -1 with
 * errno set on failure.
 */
int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  long rc;

  rc = syscall(__NR_io_uring_register, fd, opcode, arg, nargs);

  return rc;
}
453 | | |
454 | | |
455 | 0 | static int uv__use_io_uring(uint32_t flags) { |
456 | | #if defined(__ANDROID_API__) |
457 | | return 0; /* Possibly available but blocked by seccomp. */ |
458 | | #elif defined(__arm__) && __SIZEOF_POINTER__ == 4 |
459 | | /* See https://github.com/libuv/libuv/issues/4158. */ |
460 | | return 0; /* All 32 bits kernels appear buggy. */ |
461 | | #elif defined(__powerpc64__) || defined(__ppc64__) |
462 | | /* See https://github.com/libuv/libuv/issues/4283. */ |
463 | | return 0; /* Random SIGSEGV in signal handler. */ |
464 | | #else |
465 | | /* Ternary: unknown=0, yes=1, no=-1 */ |
466 | 0 | static _Atomic int use_io_uring; |
467 | 0 | char* val; |
468 | 0 | int use; |
469 | |
|
470 | | #if defined(__hppa__) |
471 | | /* io_uring first supported on parisc in 6.1, functional in .51 |
472 | | * https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/ |
473 | | */ |
474 | | if (uv__kernel_version() < /*6.1.51*/0x060133) |
475 | | return 0; |
476 | | #endif |
477 | | |
478 | | /* SQPOLL is all kinds of buggy but epoll batching should work fine. */ |
479 | 0 | if (0 == (flags & UV__IORING_SETUP_SQPOLL)) |
480 | 0 | return 1; |
481 | | |
482 | | /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */ |
483 | 0 | if (uv__kernel_version() < /*5.10.186*/0x050ABA) |
484 | 0 | return 0; |
485 | | |
486 | 0 | use = atomic_load_explicit(&use_io_uring, memory_order_relaxed); |
487 | |
|
488 | 0 | if (use == 0) { |
489 | 0 | val = getenv("UV_USE_IO_URING"); |
490 | 0 | use = val != NULL && atoi(val) > 0 ? 1 : -1; |
491 | 0 | atomic_store_explicit(&use_io_uring, use, memory_order_relaxed); |
492 | 0 | } |
493 | |
|
494 | 0 | return use > 0; |
495 | 0 | #endif |
496 | 0 | } |
497 | | |
498 | | |
499 | | static void uv__iou_init(int epollfd, |
500 | | struct uv__iou* iou, |
501 | | uint32_t entries, |
502 | 0 | uint32_t flags) { |
503 | 0 | struct uv__io_uring_params params; |
504 | 0 | struct epoll_event e; |
505 | 0 | size_t cqlen; |
506 | 0 | size_t sqlen; |
507 | 0 | size_t maxlen; |
508 | 0 | size_t sqelen; |
509 | 0 | unsigned kernel_version; |
510 | 0 | uint32_t* sqarray; |
511 | 0 | uint32_t i; |
512 | 0 | char* sq; |
513 | 0 | char* sqe; |
514 | 0 | int ringfd; |
515 | 0 | int no_sqarray; |
516 | |
|
517 | 0 | sq = MAP_FAILED; |
518 | 0 | sqe = MAP_FAILED; |
519 | |
|
520 | 0 | if (!uv__use_io_uring(flags)) |
521 | 0 | return; |
522 | | |
523 | 0 | kernel_version = uv__kernel_version(); |
524 | 0 | no_sqarray = |
525 | 0 | UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600); |
526 | | |
527 | | /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement. |
528 | | * Mostly academic because we check for a v5.13 kernel afterwards anyway. |
529 | | */ |
530 | 0 | memset(¶ms, 0, sizeof(params)); |
531 | 0 | params.flags = flags | no_sqarray; |
532 | |
|
533 | 0 | if (flags & UV__IORING_SETUP_SQPOLL) |
534 | 0 | params.sq_thread_idle = 10; /* milliseconds */ |
535 | | |
536 | | /* Kernel returns a file descriptor with O_CLOEXEC flag set. */ |
537 | 0 | ringfd = uv__io_uring_setup(entries, ¶ms); |
538 | 0 | if (ringfd == -1) |
539 | 0 | return; |
540 | | |
541 | | /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're |
542 | | * actually detecting is whether IORING_OP_STATX works with SQPOLL. |
543 | | */ |
544 | 0 | if (!(params.features & UV__IORING_FEAT_RSRC_TAGS)) |
545 | 0 | goto fail; |
546 | | |
547 | | /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */ |
548 | 0 | if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP)) |
549 | 0 | goto fail; |
550 | | |
551 | | /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */ |
552 | 0 | if (!(params.features & UV__IORING_FEAT_NODROP)) |
553 | 0 | goto fail; |
554 | | |
555 | 0 | sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t); |
556 | 0 | cqlen = |
557 | 0 | params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe); |
558 | 0 | maxlen = sqlen < cqlen ? cqlen : sqlen; |
559 | 0 | sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe); |
560 | |
|
561 | 0 | sq = mmap(0, |
562 | 0 | maxlen, |
563 | 0 | PROT_READ | PROT_WRITE, |
564 | 0 | MAP_SHARED | MAP_POPULATE, |
565 | 0 | ringfd, |
566 | 0 | 0); /* IORING_OFF_SQ_RING */ |
567 | |
|
568 | 0 | sqe = mmap(0, |
569 | 0 | sqelen, |
570 | 0 | PROT_READ | PROT_WRITE, |
571 | 0 | MAP_SHARED | MAP_POPULATE, |
572 | 0 | ringfd, |
573 | 0 | 0x10000000ull); /* IORING_OFF_SQES */ |
574 | |
|
575 | 0 | if (sq == MAP_FAILED || sqe == MAP_FAILED) |
576 | 0 | goto fail; |
577 | | |
578 | 0 | if (flags & UV__IORING_SETUP_SQPOLL) { |
579 | | /* Only interested in completion events. To get notified when |
580 | | * the kernel pulls items from the submission ring, add POLLOUT. |
581 | | */ |
582 | 0 | memset(&e, 0, sizeof(e)); |
583 | 0 | e.events = POLLIN; |
584 | 0 | e.data.fd = ringfd; |
585 | |
|
586 | 0 | if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e)) |
587 | 0 | goto fail; |
588 | 0 | } |
589 | | |
590 | 0 | iou->sqhead = (uint32_t*) (sq + params.sq_off.head); |
591 | 0 | iou->sqtail = (uint32_t*) (sq + params.sq_off.tail); |
592 | 0 | iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask); |
593 | 0 | iou->sqflags = (uint32_t*) (sq + params.sq_off.flags); |
594 | 0 | iou->cqhead = (uint32_t*) (sq + params.cq_off.head); |
595 | 0 | iou->cqtail = (uint32_t*) (sq + params.cq_off.tail); |
596 | 0 | iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask); |
597 | 0 | iou->sq = sq; |
598 | 0 | iou->cqe = sq + params.cq_off.cqes; |
599 | 0 | iou->sqe = sqe; |
600 | 0 | iou->sqlen = sqlen; |
601 | 0 | iou->cqlen = cqlen; |
602 | 0 | iou->maxlen = maxlen; |
603 | 0 | iou->sqelen = sqelen; |
604 | 0 | iou->ringfd = ringfd; |
605 | 0 | iou->in_flight = 0; |
606 | |
|
607 | 0 | if (no_sqarray) |
608 | 0 | return; |
609 | | |
610 | 0 | sqarray = (uint32_t*) (sq + params.sq_off.array); |
611 | 0 | for (i = 0; i <= iou->sqmask; i++) |
612 | 0 | sqarray[i] = i; /* Slot -> sqe identity mapping. */ |
613 | |
|
614 | 0 | return; |
615 | | |
616 | 0 | fail: |
617 | 0 | if (sq != MAP_FAILED) |
618 | 0 | munmap(sq, maxlen); |
619 | |
|
620 | 0 | if (sqe != MAP_FAILED) |
621 | 0 | munmap(sqe, sqelen); |
622 | |
|
623 | 0 | uv__close(ringfd); |
624 | 0 | } |
625 | | |
626 | | |
627 | 0 | static void uv__iou_delete(struct uv__iou* iou) { |
628 | 0 | if (iou->ringfd > -1) { |
629 | 0 | munmap(iou->sq, iou->maxlen); |
630 | 0 | munmap(iou->sqe, iou->sqelen); |
631 | 0 | uv__close(iou->ringfd); |
632 | 0 | iou->ringfd = -1; |
633 | 0 | } |
634 | 0 | } |
635 | | |
636 | | |
637 | 0 | int uv__platform_loop_init(uv_loop_t* loop) { |
638 | 0 | uv__loop_internal_fields_t* lfields; |
639 | |
|
640 | 0 | lfields = uv__get_internal_fields(loop); |
641 | 0 | lfields->ctl.ringfd = -1; |
642 | 0 | lfields->iou.ringfd = -2; /* "uninitialized" */ |
643 | |
|
644 | 0 | loop->inotify_watchers = NULL; |
645 | 0 | loop->inotify_fd = -1; |
646 | 0 | loop->backend_fd = epoll_create1(O_CLOEXEC); |
647 | |
|
648 | 0 | if (loop->backend_fd == -1) |
649 | 0 | return UV__ERR(errno); |
650 | | |
651 | 0 | uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0); |
652 | |
|
653 | 0 | return 0; |
654 | 0 | } |
655 | | |
656 | | |
657 | 0 | int uv__io_fork(uv_loop_t* loop) { |
658 | 0 | int err; |
659 | 0 | struct watcher_list* root; |
660 | |
|
661 | 0 | root = uv__inotify_watchers(loop)->rbh_root; |
662 | |
|
663 | 0 | uv__close(loop->backend_fd); |
664 | 0 | loop->backend_fd = -1; |
665 | | |
666 | | /* TODO(bnoordhuis) Loses items from the submission and completion rings. */ |
667 | 0 | uv__platform_loop_delete(loop); |
668 | |
|
669 | 0 | err = uv__platform_loop_init(loop); |
670 | 0 | if (err) |
671 | 0 | return err; |
672 | | |
673 | 0 | return uv__inotify_fork(loop, root); |
674 | 0 | } |
675 | | |
676 | | |
677 | 0 | void uv__platform_loop_delete(uv_loop_t* loop) { |
678 | 0 | uv__loop_internal_fields_t* lfields; |
679 | |
|
680 | 0 | lfields = uv__get_internal_fields(loop); |
681 | 0 | uv__iou_delete(&lfields->ctl); |
682 | 0 | uv__iou_delete(&lfields->iou); |
683 | |
|
684 | 0 | if (loop->inotify_fd != -1) { |
685 | 0 | uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN); |
686 | 0 | uv__close(loop->inotify_fd); |
687 | 0 | loop->inotify_fd = -1; |
688 | 0 | } |
689 | 0 | } |
690 | | |
691 | | |
/* Describes a batch of epoll events so that uv__platform_invalidate_fd() can
 * neutralize entries that refer to a file descriptor being closed.
 * NOTE(review): presumably published through the loop's internal `inv` field
 * by the poll routine while it dispatches events; that code is outside this
 * part of the file.
 */
struct uv__invalidate {
  struct epoll_event (*prep)[256];  /* Not used by the code visible here. */
  struct epoll_event* events;       /* Events to scan. */
  int nfds;                         /* Number of entries in `events`. */
};
697 | | |
698 | | |
699 | 0 | void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) { |
700 | 0 | uv__loop_internal_fields_t* lfields; |
701 | 0 | struct uv__invalidate* inv; |
702 | 0 | struct epoll_event dummy; |
703 | 0 | int i; |
704 | |
|
705 | 0 | lfields = uv__get_internal_fields(loop); |
706 | 0 | inv = lfields->inv; |
707 | | |
708 | | /* Invalidate events with same file descriptor */ |
709 | 0 | if (inv != NULL) |
710 | 0 | for (i = 0; i < inv->nfds; i++) |
711 | 0 | if (inv->events[i].data.fd == fd) |
712 | 0 | inv->events[i].data.fd = -1; |
713 | | |
714 | | /* Remove the file descriptor from the epoll. |
715 | | * This avoids a problem where the same file description remains open |
716 | | * in another process, causing repeated junk epoll events. |
717 | | * |
718 | | * Perform EPOLL_CTL_DEL immediately instead of going through |
719 | | * io_uring's submit queue, otherwise the file descriptor may |
720 | | * be closed by the time the kernel starts the operation. |
721 | | * |
722 | | * We pass in a dummy epoll_event, to work around a bug in old kernels. |
723 | | * |
724 | | * Work around a bug in kernels 3.10 to 3.19 where passing a struct that |
725 | | * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings. |
726 | | */ |
727 | 0 | memset(&dummy, 0, sizeof(dummy)); |
728 | 0 | epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy); |
729 | 0 | } |
730 | | |
731 | | |
732 | 0 | int uv__io_check_fd(uv_loop_t* loop, int fd) { |
733 | 0 | struct epoll_event e; |
734 | 0 | int rc; |
735 | |
|
736 | 0 | memset(&e, 0, sizeof(e)); |
737 | 0 | e.events = POLLIN; |
738 | 0 | e.data.fd = -1; |
739 | |
|
740 | 0 | rc = 0; |
741 | 0 | if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e)) |
742 | 0 | if (errno != EEXIST) |
743 | 0 | rc = UV__ERR(errno); |
744 | |
|
745 | 0 | if (rc == 0) |
746 | 0 | if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e)) |
747 | 0 | abort(); |
748 | | |
749 | 0 | return rc; |
750 | 0 | } |
751 | | |
752 | | |
/* Reserve the next submission queue entry for `req`.
 *
 * Returns a zeroed entry whose user_data is the request pointer, or NULL
 * when no ring is available or the ring is full. On success the request is
 * registered with the loop and counted as in flight.
 *
 * Caller must initialize SQE and call uv__iou_submit().
 */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
   * initialization failed. Anything else is a valid ring file descriptor.
   */
  if (iou->ringfd == -2) {
    /* By default, the SQPOLL ring is not created. It is only attempted when
     * the loop is configured with UV_LOOP_ENABLE_IO_URING_SQPOLL and
     * uv__use_io_uring() agrees, which requires the UV_USE_IO_URING
     * environment variable to be set to a positive number.
     */
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);

    /* uv__iou_init() leaves ringfd untouched when it fails. */
    if (iou->ringfd == -2)
      iou->ringfd = -1;  /* "failed" */
  }

  if (iou->ringfd == -1)
    return NULL;

  /* The head is advanced by the consumer of the ring, hence the acquire
   * load. The tail is only written by uv__iou_submit().
   */
  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  /* One slot is always left unused so that "full" and "empty" differ. */
  if ((head & mask) == ((tail + 1) & mask))
    return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  sqe->user_data = (uintptr_t) req;

  /* Pacify uv_cancel(). */
  req->work_req.loop = loop;
  req->work_req.work = NULL;
  req->work_req.done = NULL;
  uv__queue_init(&req->work_req.wq);

  uv__req_register(loop);
  iou->in_flight++;

  return sqe;
}
807 | | |
808 | | |
809 | 0 | static void uv__iou_submit(struct uv__iou* iou) { |
810 | 0 | uint32_t flags; |
811 | |
|
812 | 0 | atomic_store_explicit((_Atomic uint32_t*) iou->sqtail, |
813 | 0 | *iou->sqtail + 1, |
814 | 0 | memory_order_release); |
815 | |
|
816 | 0 | flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags, |
817 | 0 | memory_order_acquire); |
818 | |
|
819 | 0 | if (flags & UV__IORING_SQ_NEED_WAKEUP) |
820 | 0 | if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP)) |
821 | 0 | if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */ |
822 | 0 | perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */ |
823 | 0 | } |
824 | | |
825 | | |
826 | 0 | int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) { |
827 | 0 | struct uv__io_uring_sqe* sqe; |
828 | 0 | struct uv__iou* iou; |
829 | 0 | int kv; |
830 | |
|
831 | 0 | kv = uv__kernel_version(); |
832 | | /* Work around a poorly understood bug in older kernels where closing a file |
833 | | * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to |
834 | | * execve("/foo/bar") later on. The bug seems to have been fixed somewhere |
835 | | * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit |
836 | | * but good candidates are the several data race fixes. Interestingly, it |
837 | | * seems to manifest only when running under Docker so the possibility of |
838 | | * a Docker bug can't be completely ruled out either. Yay, computers. |
839 | | * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and |
840 | | * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be |
841 | | * solved. |
842 | | */ |
843 | 0 | if (kv < /* 5.15.90 */ 0x050F5A) |
844 | 0 | return 0; |
845 | | |
846 | 0 | if (kv >= /* 5.16.0 */ 0x050A00 && kv < /* 6.1.0 */ 0x060100) |
847 | 0 | return 0; |
848 | | |
849 | | |
850 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
851 | |
|
852 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
853 | 0 | if (sqe == NULL) |
854 | 0 | return 0; |
855 | | |
856 | 0 | sqe->fd = req->file; |
857 | 0 | sqe->opcode = UV__IORING_OP_CLOSE; |
858 | |
|
859 | 0 | uv__iou_submit(iou); |
860 | |
|
861 | 0 | return 1; |
862 | 0 | } |
863 | | |
864 | | |
865 | 0 | int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) { |
866 | 0 | struct uv__io_uring_sqe* sqe; |
867 | 0 | struct uv__iou* iou; |
868 | |
|
869 | 0 | if (uv__kernel_version() < /* 6.9 */0x060900) |
870 | 0 | return 0; |
871 | | |
872 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
873 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
874 | 0 | if (sqe == NULL) |
875 | 0 | return 0; |
876 | | |
877 | 0 | sqe->fd = req->file; |
878 | 0 | sqe->off = req->off; |
879 | 0 | sqe->opcode = UV__IORING_OP_FTRUNCATE; |
880 | 0 | uv__iou_submit(iou); |
881 | |
|
882 | 0 | return 1; |
883 | 0 | } |
884 | | |
885 | | int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop, |
886 | | uv_fs_t* req, |
887 | 0 | uint32_t fsync_flags) { |
888 | 0 | struct uv__io_uring_sqe* sqe; |
889 | 0 | struct uv__iou* iou; |
890 | |
|
891 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
892 | |
|
893 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
894 | 0 | if (sqe == NULL) |
895 | 0 | return 0; |
896 | | |
897 | | /* Little known fact: setting seq->off and seq->len turns |
898 | | * it into an asynchronous sync_file_range() operation. |
899 | | */ |
900 | 0 | sqe->fd = req->file; |
901 | 0 | sqe->fsync_flags = fsync_flags; |
902 | 0 | sqe->opcode = UV__IORING_OP_FSYNC; |
903 | |
|
904 | 0 | uv__iou_submit(iou); |
905 | |
|
906 | 0 | return 1; |
907 | 0 | } |
908 | | |
909 | | |
910 | 0 | int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) { |
911 | 0 | struct uv__io_uring_sqe* sqe; |
912 | 0 | struct uv__iou* iou; |
913 | |
|
914 | 0 | if (uv__kernel_version() < /* 5.15.0 */0x050F00) |
915 | 0 | return 0; |
916 | | |
917 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
918 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
919 | 0 | if (sqe == NULL) |
920 | 0 | return 0; |
921 | | |
922 | 0 | sqe->addr = (uintptr_t) req->path; |
923 | 0 | sqe->fd = AT_FDCWD; |
924 | 0 | sqe->addr2 = (uintptr_t) req->new_path; |
925 | 0 | sqe->len = AT_FDCWD; |
926 | 0 | sqe->opcode = UV__IORING_OP_LINKAT; |
927 | |
|
928 | 0 | uv__iou_submit(iou); |
929 | |
|
930 | 0 | return 1; |
931 | 0 | } |
932 | | |
933 | | |
934 | 0 | int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) { |
935 | 0 | struct uv__io_uring_sqe* sqe; |
936 | 0 | struct uv__iou* iou; |
937 | |
|
938 | 0 | if (uv__kernel_version() < /* 5.15.0 */0x050F00) |
939 | 0 | return 0; |
940 | | |
941 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
942 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
943 | 0 | if (sqe == NULL) |
944 | 0 | return 0; |
945 | | |
946 | 0 | sqe->addr = (uintptr_t) req->path; |
947 | 0 | sqe->fd = AT_FDCWD; |
948 | 0 | sqe->len = req->mode; |
949 | 0 | sqe->opcode = UV__IORING_OP_MKDIRAT; |
950 | |
|
951 | 0 | uv__iou_submit(iou); |
952 | |
|
953 | 0 | return 1; |
954 | 0 | } |
955 | | |
956 | | |
957 | 0 | int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) { |
958 | 0 | struct uv__io_uring_sqe* sqe; |
959 | 0 | struct uv__iou* iou; |
960 | |
|
961 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
962 | |
|
963 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
964 | 0 | if (sqe == NULL) |
965 | 0 | return 0; |
966 | | |
967 | 0 | sqe->addr = (uintptr_t) req->path; |
968 | 0 | sqe->fd = AT_FDCWD; |
969 | 0 | sqe->len = req->mode; |
970 | 0 | sqe->opcode = UV__IORING_OP_OPENAT; |
971 | 0 | sqe->open_flags = req->flags | O_CLOEXEC; |
972 | |
|
973 | 0 | uv__iou_submit(iou); |
974 | |
|
975 | 0 | return 1; |
976 | 0 | } |
977 | | |
978 | | |
979 | 0 | int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) { |
980 | 0 | struct uv__io_uring_sqe* sqe; |
981 | 0 | struct uv__iou* iou; |
982 | |
|
983 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
984 | |
|
985 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
986 | 0 | if (sqe == NULL) |
987 | 0 | return 0; |
988 | | |
989 | 0 | sqe->addr = (uintptr_t) req->path; |
990 | 0 | sqe->fd = AT_FDCWD; |
991 | 0 | sqe->addr2 = (uintptr_t) req->new_path; |
992 | 0 | sqe->len = AT_FDCWD; |
993 | 0 | sqe->opcode = UV__IORING_OP_RENAMEAT; |
994 | |
|
995 | 0 | uv__iou_submit(iou); |
996 | |
|
997 | 0 | return 1; |
998 | 0 | } |
999 | | |
1000 | | |
1001 | 0 | int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) { |
1002 | 0 | struct uv__io_uring_sqe* sqe; |
1003 | 0 | struct uv__iou* iou; |
1004 | |
|
1005 | 0 | if (uv__kernel_version() < /* 5.15.0 */0x050F00) |
1006 | 0 | return 0; |
1007 | | |
1008 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
1009 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
1010 | 0 | if (sqe == NULL) |
1011 | 0 | return 0; |
1012 | | |
1013 | 0 | sqe->addr = (uintptr_t) req->path; |
1014 | 0 | sqe->fd = AT_FDCWD; |
1015 | 0 | sqe->addr2 = (uintptr_t) req->new_path; |
1016 | 0 | sqe->opcode = UV__IORING_OP_SYMLINKAT; |
1017 | |
|
1018 | 0 | uv__iou_submit(iou); |
1019 | |
|
1020 | 0 | return 1; |
1021 | 0 | } |
1022 | | |
1023 | | |
1024 | 0 | int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) { |
1025 | 0 | struct uv__io_uring_sqe* sqe; |
1026 | 0 | struct uv__iou* iou; |
1027 | |
|
1028 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
1029 | |
|
1030 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
1031 | 0 | if (sqe == NULL) |
1032 | 0 | return 0; |
1033 | | |
1034 | 0 | sqe->addr = (uintptr_t) req->path; |
1035 | 0 | sqe->fd = AT_FDCWD; |
1036 | 0 | sqe->opcode = UV__IORING_OP_UNLINKAT; |
1037 | |
|
1038 | 0 | uv__iou_submit(iou); |
1039 | |
|
1040 | 0 | return 1; |
1041 | 0 | } |
1042 | | |
1043 | | |
1044 | | int uv__iou_fs_read_or_write(uv_loop_t* loop, |
1045 | | uv_fs_t* req, |
1046 | 0 | int is_read) { |
1047 | 0 | struct uv__io_uring_sqe* sqe; |
1048 | 0 | struct uv__iou* iou; |
1049 | | |
1050 | | /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fallback |
1051 | | * to the threadpool on writes */ |
1052 | 0 | if (req->nbufs > IOV_MAX) { |
1053 | 0 | if (is_read) |
1054 | 0 | req->nbufs = IOV_MAX; |
1055 | 0 | else |
1056 | 0 | return 0; |
1057 | 0 | } |
1058 | | |
1059 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
1060 | |
|
1061 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
1062 | 0 | if (sqe == NULL) |
1063 | 0 | return 0; |
1064 | | |
1065 | 0 | sqe->addr = (uintptr_t) req->bufs; |
1066 | 0 | sqe->fd = req->file; |
1067 | 0 | sqe->len = req->nbufs; |
1068 | 0 | sqe->off = req->off < 0 ? -1 : req->off; |
1069 | 0 | sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV; |
1070 | |
|
1071 | 0 | uv__iou_submit(iou); |
1072 | |
|
1073 | 0 | return 1; |
1074 | 0 | } |
1075 | | |
1076 | | |
1077 | | int uv__iou_fs_statx(uv_loop_t* loop, |
1078 | | uv_fs_t* req, |
1079 | | int is_fstat, |
1080 | 0 | int is_lstat) { |
1081 | 0 | struct uv__io_uring_sqe* sqe; |
1082 | 0 | struct uv__statx* statxbuf; |
1083 | 0 | struct uv__iou* iou; |
1084 | |
|
1085 | 0 | statxbuf = uv__malloc(sizeof(*statxbuf)); |
1086 | 0 | if (statxbuf == NULL) |
1087 | 0 | return 0; |
1088 | | |
1089 | 0 | iou = &uv__get_internal_fields(loop)->iou; |
1090 | |
|
1091 | 0 | sqe = uv__iou_get_sqe(iou, loop, req); |
1092 | 0 | if (sqe == NULL) { |
1093 | 0 | uv__free(statxbuf); |
1094 | 0 | return 0; |
1095 | 0 | } |
1096 | | |
1097 | 0 | req->ptr = statxbuf; |
1098 | |
|
1099 | 0 | sqe->addr = (uintptr_t) req->path; |
1100 | 0 | sqe->addr2 = (uintptr_t) statxbuf; |
1101 | 0 | sqe->fd = AT_FDCWD; |
1102 | 0 | sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */ |
1103 | 0 | sqe->opcode = UV__IORING_OP_STATX; |
1104 | |
|
1105 | 0 | if (is_fstat) { |
1106 | 0 | sqe->addr = (uintptr_t) ""; |
1107 | 0 | sqe->fd = req->file; |
1108 | 0 | sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */ |
1109 | 0 | } |
1110 | |
|
1111 | 0 | if (is_lstat) |
1112 | 0 | sqe->statx_flags |= AT_SYMLINK_NOFOLLOW; |
1113 | |
|
1114 | 0 | uv__iou_submit(iou); |
1115 | |
|
1116 | 0 | return 1; |
1117 | 0 | } |
1118 | | |
1119 | | |
1120 | 0 | void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) { |
1121 | 0 | buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor); |
1122 | 0 | buf->st_mode = statxbuf->stx_mode; |
1123 | 0 | buf->st_nlink = statxbuf->stx_nlink; |
1124 | 0 | buf->st_uid = statxbuf->stx_uid; |
1125 | 0 | buf->st_gid = statxbuf->stx_gid; |
1126 | 0 | buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor); |
1127 | 0 | buf->st_ino = statxbuf->stx_ino; |
1128 | 0 | buf->st_size = statxbuf->stx_size; |
1129 | 0 | buf->st_blksize = statxbuf->stx_blksize; |
1130 | 0 | buf->st_blocks = statxbuf->stx_blocks; |
1131 | 0 | buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec; |
1132 | 0 | buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec; |
1133 | 0 | buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec; |
1134 | 0 | buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec; |
1135 | 0 | buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec; |
1136 | 0 | buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec; |
1137 | 0 | buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec; |
1138 | 0 | buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec; |
1139 | 0 | buf->st_flags = 0; |
1140 | 0 | buf->st_gen = 0; |
1141 | 0 | } |
1142 | | |
1143 | | |
1144 | 0 | static void uv__iou_fs_statx_post(uv_fs_t* req) { |
1145 | 0 | struct uv__statx* statxbuf; |
1146 | 0 | uv_stat_t* buf; |
1147 | |
|
1148 | 0 | buf = &req->statbuf; |
1149 | 0 | statxbuf = req->ptr; |
1150 | 0 | req->ptr = NULL; |
1151 | |
|
1152 | 0 | if (req->result == 0) { |
1153 | 0 | uv__msan_unpoison(statxbuf, sizeof(*statxbuf)); |
1154 | 0 | uv__statx_to_stat(statxbuf, buf); |
1155 | 0 | req->ptr = buf; |
1156 | 0 | } |
1157 | |
|
1158 | 0 | uv__free(statxbuf); |
1159 | 0 | } |
1160 | | |
1161 | | |
1162 | 0 | static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) { |
1163 | 0 | struct uv__io_uring_cqe* cqe; |
1164 | 0 | struct uv__io_uring_cqe* e; |
1165 | 0 | uv_fs_t* req; |
1166 | 0 | uint32_t head; |
1167 | 0 | uint32_t tail; |
1168 | 0 | uint32_t mask; |
1169 | 0 | uint32_t i; |
1170 | 0 | uint32_t flags; |
1171 | 0 | int nevents; |
1172 | 0 | int rc; |
1173 | |
|
1174 | 0 | head = *iou->cqhead; |
1175 | 0 | tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail, |
1176 | 0 | memory_order_acquire); |
1177 | 0 | mask = iou->cqmask; |
1178 | 0 | cqe = iou->cqe; |
1179 | 0 | nevents = 0; |
1180 | |
|
1181 | 0 | for (i = head; i != tail; i++) { |
1182 | 0 | e = &cqe[i & mask]; |
1183 | |
|
1184 | 0 | req = (uv_fs_t*) (uintptr_t) e->user_data; |
1185 | 0 | assert(req->type == UV_FS); |
1186 | |
|
1187 | 0 | uv__req_unregister(loop); |
1188 | 0 | iou->in_flight--; |
1189 | | |
1190 | | /* If the op is not supported by the kernel retry using the thread pool */ |
1191 | 0 | if (e->res == -EOPNOTSUPP) { |
1192 | 0 | uv__fs_post(loop, req); |
1193 | 0 | continue; |
1194 | 0 | } |
1195 | | |
1196 | | /* io_uring stores error codes as negative numbers, same as libuv. */ |
1197 | 0 | req->result = e->res; |
1198 | |
|
1199 | 0 | switch (req->fs_type) { |
1200 | 0 | case UV_FS_FSTAT: |
1201 | 0 | case UV_FS_LSTAT: |
1202 | 0 | case UV_FS_STAT: |
1203 | 0 | uv__iou_fs_statx_post(req); |
1204 | 0 | break; |
1205 | 0 | default: /* Squelch -Wswitch warnings. */ |
1206 | 0 | break; |
1207 | 0 | } |
1208 | | |
1209 | 0 | uv__metrics_update_idle_time(loop); |
1210 | 0 | req->cb(req); |
1211 | 0 | nevents++; |
1212 | 0 | } |
1213 | | |
1214 | 0 | atomic_store_explicit((_Atomic uint32_t*) iou->cqhead, |
1215 | 0 | tail, |
1216 | 0 | memory_order_release); |
1217 | | |
1218 | | /* Check whether CQE's overflowed, if so enter the kernel to make them |
1219 | | * available. Don't grab them immediately but in the next loop iteration to |
1220 | | * avoid loop starvation. */ |
1221 | 0 | flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags, |
1222 | 0 | memory_order_acquire); |
1223 | |
|
1224 | 0 | if (flags & UV__IORING_SQ_CQ_OVERFLOW) { |
1225 | 0 | do |
1226 | 0 | rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS); |
1227 | 0 | while (rc == -1 && errno == EINTR); |
1228 | |
|
1229 | 0 | if (rc < 0) |
1230 | 0 | perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */ |
1231 | 0 | } |
1232 | |
|
1233 | 0 | uv__metrics_inc_events(loop, nevents); |
1234 | 0 | if (uv__get_internal_fields(loop)->current_timeout == 0) |
1235 | 0 | uv__metrics_inc_events_waiting(loop, nevents); |
1236 | 0 | } |
1237 | | |
1238 | | |
1239 | | /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be |
1240 | | * executed immediately, otherwise the file descriptor may have been closed |
1241 | | * by the time the kernel starts the operation. |
1242 | | */ |
1243 | | static void uv__epoll_ctl_prep(int epollfd, |
1244 | | struct uv__iou* ctl, |
1245 | | struct epoll_event (*events)[256], |
1246 | | int op, |
1247 | | int fd, |
1248 | 0 | struct epoll_event* e) { |
1249 | | /* FIXME: Avoid dangling pointer to uv__io_poll stack frame. */ |
1250 | 0 | #ifndef __clang_analyzer__ /* core.StackAddressEscape */ |
1251 | 0 | struct uv__io_uring_sqe* sqe; |
1252 | 0 | struct epoll_event* pe; |
1253 | 0 | uint32_t mask; |
1254 | 0 | uint32_t slot; |
1255 | |
|
1256 | 0 | assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD); |
1257 | 0 | assert(ctl->ringfd != -1); |
1258 | |
|
1259 | 0 | mask = ctl->sqmask; |
1260 | 0 | slot = (*ctl->sqtail)++ & mask; |
1261 | |
|
1262 | 0 | pe = &(*events)[slot]; |
1263 | 0 | *pe = *e; |
1264 | |
|
1265 | 0 | sqe = ctl->sqe; |
1266 | 0 | sqe = &sqe[slot]; |
1267 | |
|
1268 | 0 | memset(sqe, 0, sizeof(*sqe)); |
1269 | 0 | sqe->addr = (uintptr_t) pe; |
1270 | 0 | sqe->fd = epollfd; |
1271 | 0 | sqe->len = op; |
1272 | 0 | sqe->off = fd; |
1273 | 0 | sqe->opcode = UV__IORING_OP_EPOLL_CTL; |
1274 | 0 | sqe->user_data = op | slot << 2 | (int64_t) fd << 32; |
1275 | |
|
1276 | 0 | if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask)) |
1277 | 0 | uv__epoll_ctl_flush(epollfd, ctl, events); |
1278 | 0 | #endif |
1279 | 0 | } |
1280 | | |
1281 | | |
1282 | | static void uv__epoll_ctl_flush(int epollfd, |
1283 | | struct uv__iou* ctl, |
1284 | 0 | struct epoll_event (*events)[256]) { |
1285 | 0 | struct epoll_event oldevents[256]; |
1286 | 0 | struct uv__io_uring_cqe* cqe; |
1287 | 0 | uint32_t oldslot; |
1288 | 0 | uint32_t slot; |
1289 | 0 | uint32_t n; |
1290 | 0 | int fd; |
1291 | 0 | int op; |
1292 | 0 | int rc; |
1293 | |
|
1294 | 0 | STATIC_ASSERT(sizeof(oldevents) == sizeof(*events)); |
1295 | 0 | assert(ctl->ringfd != -1); |
1296 | 0 | assert(*ctl->sqhead != *ctl->sqtail); |
1297 | |
|
1298 | 0 | n = *ctl->sqtail - *ctl->sqhead; |
1299 | 0 | do |
1300 | 0 | rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS); |
1301 | 0 | while (rc == -1 && errno == EINTR); |
1302 | |
|
1303 | 0 | if (rc < 0) |
1304 | 0 | perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */ |
1305 | |
|
1306 | 0 | if (rc != (int) n) |
1307 | 0 | abort(); |
1308 | | |
1309 | 0 | assert(*ctl->sqhead == *ctl->sqtail); |
1310 | |
|
1311 | 0 | memcpy(oldevents, *events, sizeof(*events)); |
1312 | | |
1313 | | /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors |
1314 | | * that have been closed, or EPOLL_CTL_ADD commands for file descriptors |
1315 | | * that we are already watching. Ignore the former and retry the latter |
1316 | | * with EPOLL_CTL_MOD. |
1317 | | */ |
1318 | 0 | while (*ctl->cqhead != *ctl->cqtail) { |
1319 | 0 | slot = (*ctl->cqhead)++ & ctl->cqmask; |
1320 | |
|
1321 | 0 | cqe = ctl->cqe; |
1322 | 0 | cqe = &cqe[slot]; |
1323 | |
|
1324 | 0 | if (cqe->res == 0) |
1325 | 0 | continue; |
1326 | | |
1327 | 0 | fd = cqe->user_data >> 32; |
1328 | 0 | op = 3 & cqe->user_data; |
1329 | 0 | oldslot = 255 & (cqe->user_data >> 2); |
1330 | |
|
1331 | 0 | if (op == EPOLL_CTL_DEL) |
1332 | 0 | continue; |
1333 | | |
1334 | 0 | if (op != EPOLL_CTL_ADD) |
1335 | 0 | abort(); |
1336 | | |
1337 | 0 | if (cqe->res != -EEXIST) |
1338 | 0 | abort(); |
1339 | | |
1340 | 0 | uv__epoll_ctl_prep(epollfd, |
1341 | 0 | ctl, |
1342 | 0 | events, |
1343 | 0 | EPOLL_CTL_MOD, |
1344 | 0 | fd, |
1345 | 0 | &oldevents[oldslot]); |
1346 | 0 | } |
1347 | 0 | } |
1348 | | |
1349 | | |
1350 | 0 | void uv__io_poll(uv_loop_t* loop, int timeout) { |
1351 | 0 | uv__loop_internal_fields_t* lfields; |
1352 | 0 | struct epoll_event events[1024]; |
1353 | 0 | struct epoll_event prep[256]; |
1354 | 0 | struct uv__invalidate inv; |
1355 | 0 | struct epoll_event* pe; |
1356 | 0 | struct epoll_event e; |
1357 | 0 | struct uv__iou* ctl; |
1358 | 0 | struct uv__iou* iou; |
1359 | 0 | int real_timeout; |
1360 | 0 | struct uv__queue* q; |
1361 | 0 | uv__io_t* w; |
1362 | 0 | sigset_t* sigmask; |
1363 | 0 | sigset_t sigset; |
1364 | 0 | uint64_t base; |
1365 | 0 | int have_iou_events; |
1366 | 0 | int have_signals; |
1367 | 0 | int nevents; |
1368 | 0 | int epollfd; |
1369 | 0 | int count; |
1370 | 0 | int nfds; |
1371 | 0 | int fd; |
1372 | 0 | int op; |
1373 | 0 | int i; |
1374 | 0 | int user_timeout; |
1375 | 0 | int reset_timeout; |
1376 | |
|
1377 | 0 | lfields = uv__get_internal_fields(loop); |
1378 | 0 | ctl = &lfields->ctl; |
1379 | 0 | iou = &lfields->iou; |
1380 | |
|
1381 | 0 | sigmask = NULL; |
1382 | 0 | if (loop->flags & UV_LOOP_BLOCK_SIGPROF) { |
1383 | 0 | sigemptyset(&sigset); |
1384 | 0 | sigaddset(&sigset, SIGPROF); |
1385 | 0 | sigmask = &sigset; |
1386 | 0 | } |
1387 | |
|
1388 | 0 | assert(timeout >= -1); |
1389 | 0 | base = loop->time; |
1390 | 0 | count = 48; /* Benchmarks suggest this gives the best throughput. */ |
1391 | 0 | real_timeout = timeout; |
1392 | |
|
1393 | 0 | if (lfields->flags & UV_METRICS_IDLE_TIME) { |
1394 | 0 | reset_timeout = 1; |
1395 | 0 | user_timeout = timeout; |
1396 | 0 | timeout = 0; |
1397 | 0 | } else { |
1398 | 0 | reset_timeout = 0; |
1399 | 0 | user_timeout = 0; |
1400 | 0 | } |
1401 | |
|
1402 | 0 | epollfd = loop->backend_fd; |
1403 | |
|
1404 | 0 | memset(&e, 0, sizeof(e)); |
1405 | |
|
1406 | 0 | while (!uv__queue_empty(&loop->watcher_queue)) { |
1407 | 0 | q = uv__queue_head(&loop->watcher_queue); |
1408 | 0 | w = uv__queue_data(q, uv__io_t, watcher_queue); |
1409 | 0 | uv__queue_remove(q); |
1410 | 0 | uv__queue_init(q); |
1411 | |
|
1412 | 0 | op = EPOLL_CTL_MOD; |
1413 | 0 | if (w->events == 0) |
1414 | 0 | op = EPOLL_CTL_ADD; |
1415 | |
|
1416 | 0 | w->events = w->pevents; |
1417 | 0 | e.events = w->pevents; |
1418 | 0 | e.data.fd = w->fd; |
1419 | 0 | fd = w->fd; |
1420 | |
|
1421 | 0 | if (ctl->ringfd != -1) { |
1422 | 0 | uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e); |
1423 | 0 | continue; |
1424 | 0 | } |
1425 | | |
1426 | 0 | if (!epoll_ctl(epollfd, op, fd, &e)) |
1427 | 0 | continue; |
1428 | | |
1429 | 0 | assert(op == EPOLL_CTL_ADD); |
1430 | 0 | assert(errno == EEXIST); |
1431 | | |
1432 | | /* File descriptor that's been watched before, update event mask. */ |
1433 | 0 | if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e)) |
1434 | 0 | abort(); |
1435 | 0 | } |
1436 | | |
1437 | 0 | inv.events = events; |
1438 | 0 | inv.prep = &prep; |
1439 | 0 | inv.nfds = -1; |
1440 | |
|
1441 | 0 | for (;;) { |
1442 | 0 | if (loop->nfds == 0) |
1443 | 0 | if (iou->in_flight == 0) |
1444 | 0 | break; |
1445 | | |
1446 | | /* All event mask mutations should be visible to the kernel before |
1447 | | * we enter epoll_pwait(). |
1448 | | */ |
1449 | 0 | if (ctl->ringfd != -1) |
1450 | 0 | while (*ctl->sqhead != *ctl->sqtail) |
1451 | 0 | uv__epoll_ctl_flush(epollfd, ctl, &prep); |
1452 | |
|
1453 | 0 | uv__io_poll_prepare(loop, NULL, timeout); |
1454 | 0 | nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask); |
1455 | 0 | uv__io_poll_check(loop, NULL); |
1456 | |
|
1457 | 0 | if (nfds == -1) |
1458 | 0 | assert(errno == EINTR); |
1459 | 0 | else if (nfds == 0) |
1460 | | /* Unlimited timeout should only return with events or signal. */ |
1461 | 0 | assert(timeout != -1); |
1462 | |
|
1463 | 0 | if (nfds == 0 || nfds == -1) { |
1464 | 0 | if (reset_timeout != 0) { |
1465 | 0 | timeout = user_timeout; |
1466 | 0 | reset_timeout = 0; |
1467 | 0 | } else if (nfds == 0) { |
1468 | 0 | return; |
1469 | 0 | } |
1470 | | |
1471 | | /* Interrupted by a signal. Update timeout and poll again. */ |
1472 | 0 | goto update_timeout; |
1473 | 0 | } |
1474 | | |
1475 | 0 | have_iou_events = 0; |
1476 | 0 | have_signals = 0; |
1477 | 0 | nevents = 0; |
1478 | |
|
1479 | 0 | inv.nfds = nfds; |
1480 | 0 | lfields->inv = &inv; |
1481 | |
|
1482 | 0 | for (i = 0; i < nfds; i++) { |
1483 | 0 | pe = events + i; |
1484 | 0 | fd = pe->data.fd; |
1485 | | |
1486 | | /* Skip invalidated events, see uv__platform_invalidate_fd */ |
1487 | 0 | if (fd == -1) |
1488 | 0 | continue; |
1489 | | |
1490 | 0 | if (fd == iou->ringfd) { |
1491 | 0 | uv__poll_io_uring(loop, iou); |
1492 | 0 | have_iou_events = 1; |
1493 | 0 | continue; |
1494 | 0 | } |
1495 | | |
1496 | 0 | assert(fd >= 0); |
1497 | 0 | assert((unsigned) fd < loop->nwatchers); |
1498 | |
|
1499 | 0 | w = loop->watchers[fd]; |
1500 | |
|
1501 | 0 | if (w == NULL) { |
1502 | | /* File descriptor that we've stopped watching, disarm it. |
1503 | | * |
1504 | | * Ignore all errors because we may be racing with another thread |
1505 | | * when the file descriptor is closed. |
1506 | | * |
1507 | | * Perform EPOLL_CTL_DEL immediately instead of going through |
1508 | | * io_uring's submit queue, otherwise the file descriptor may |
1509 | | * be closed by the time the kernel starts the operation. |
1510 | | */ |
1511 | 0 | epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe); |
1512 | 0 | continue; |
1513 | 0 | } |
1514 | | |
1515 | | /* Give users only events they're interested in. Prevents spurious |
1516 | | * callbacks when previous callback invocation in this loop has stopped |
1517 | | * the current watcher. Also, filters out events that users has not |
1518 | | * requested us to watch. |
1519 | | */ |
1520 | 0 | pe->events &= w->pevents | POLLERR | POLLHUP; |
1521 | | |
1522 | | /* Work around an epoll quirk where it sometimes reports just the |
1523 | | * EPOLLERR or EPOLLHUP event. In order to force the event loop to |
1524 | | * move forward, we merge in the read/write events that the watcher |
1525 | | * is interested in; uv__read() and uv__write() will then deal with |
1526 | | * the error or hangup in the usual fashion. |
1527 | | * |
1528 | | * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user |
1529 | | * reads the available data, calls uv_read_stop(), then sometime later |
1530 | | * calls uv_read_start() again. By then, libuv has forgotten about the |
1531 | | * hangup and the kernel won't report EPOLLIN again because there's |
1532 | | * nothing left to read. If anything, libuv is to blame here. The |
1533 | | * current hack is just a quick bandaid; to properly fix it, libuv |
1534 | | * needs to remember the error/hangup event. We should get that for |
1535 | | * free when we switch over to edge-triggered I/O. |
1536 | | */ |
1537 | 0 | if (pe->events == POLLERR || pe->events == POLLHUP) |
1538 | 0 | pe->events |= |
1539 | 0 | w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI); |
1540 | |
|
1541 | 0 | if (pe->events != 0) { |
1542 | | /* Run signal watchers last. This also affects child process watchers |
1543 | | * because those are implemented in terms of signal watchers. |
1544 | | */ |
1545 | 0 | if (w == &loop->signal_io_watcher) { |
1546 | 0 | have_signals = 1; |
1547 | 0 | } else { |
1548 | 0 | uv__metrics_update_idle_time(loop); |
1549 | 0 | uv__io_cb(loop, w, pe->events); |
1550 | 0 | } |
1551 | |
|
1552 | 0 | nevents++; |
1553 | 0 | } |
1554 | 0 | } |
1555 | |
|
1556 | 0 | uv__metrics_inc_events(loop, nevents); |
1557 | 0 | if (reset_timeout != 0) { |
1558 | 0 | timeout = user_timeout; |
1559 | 0 | reset_timeout = 0; |
1560 | 0 | uv__metrics_inc_events_waiting(loop, nevents); |
1561 | 0 | } |
1562 | |
|
1563 | 0 | if (have_signals != 0) { |
1564 | 0 | uv__metrics_update_idle_time(loop); |
1565 | 0 | uv__signal_event(loop, &loop->signal_io_watcher, POLLIN); |
1566 | 0 | } |
1567 | |
|
1568 | 0 | lfields->inv = NULL; |
1569 | |
|
1570 | 0 | if (have_iou_events != 0) |
1571 | 0 | break; /* Event loop should cycle now so don't poll again. */ |
1572 | | |
1573 | 0 | if (have_signals != 0) |
1574 | 0 | break; /* Event loop should cycle now so don't poll again. */ |
1575 | | |
1576 | 0 | if (nevents != 0) { |
1577 | 0 | if (nfds == ARRAY_SIZE(events) && --count != 0) { |
1578 | | /* Poll for more events but don't block this time. */ |
1579 | 0 | timeout = 0; |
1580 | 0 | continue; |
1581 | 0 | } |
1582 | 0 | break; |
1583 | 0 | } |
1584 | | |
1585 | 0 | update_timeout: |
1586 | 0 | if (timeout == 0) |
1587 | 0 | break; |
1588 | | |
1589 | 0 | if (timeout == -1) |
1590 | 0 | continue; |
1591 | | |
1592 | 0 | assert(timeout > 0); |
1593 | |
|
1594 | 0 | real_timeout -= (loop->time - base); |
1595 | 0 | if (real_timeout <= 0) |
1596 | 0 | break; |
1597 | | |
1598 | 0 | timeout = real_timeout; |
1599 | 0 | } |
1600 | | |
1601 | 0 | if (ctl->ringfd != -1) |
1602 | 0 | while (*ctl->sqhead != *ctl->sqtail) |
1603 | 0 | uv__epoll_ctl_flush(epollfd, ctl, &prep); |
1604 | 0 | } |
1605 | | |
1606 | 0 | uint64_t uv__hrtime(uv_clocktype_t type) { |
1607 | 0 | static _Atomic clock_t fast_clock_id = -1; |
1608 | 0 | struct timespec t; |
1609 | 0 | clock_t clock_id; |
1610 | | |
1611 | | /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has |
1612 | | * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is |
1613 | | * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may |
1614 | | * decide to make a costly system call. |
1615 | | */ |
1616 | | /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE |
1617 | | * when it has microsecond granularity or better (unlikely). |
1618 | | */ |
1619 | 0 | clock_id = CLOCK_MONOTONIC; |
1620 | 0 | if (type != UV_CLOCK_FAST) |
1621 | 0 | goto done; |
1622 | | |
1623 | 0 | clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed); |
1624 | 0 | if (clock_id != -1) |
1625 | 0 | goto done; |
1626 | | |
1627 | 0 | clock_id = CLOCK_MONOTONIC; |
1628 | 0 | if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t)) |
1629 | 0 | if (t.tv_nsec <= 1 * 1000 * 1000) |
1630 | 0 | clock_id = CLOCK_MONOTONIC_COARSE; |
1631 | |
|
1632 | 0 | atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed); |
1633 | |
|
1634 | 0 | done: |
1635 | |
|
1636 | 0 | if (clock_gettime(clock_id, &t)) |
1637 | 0 | return 0; /* Not really possible. */ |
1638 | | |
1639 | 0 | return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec; |
1640 | 0 | } |
1641 | | |
1642 | | |
1643 | 0 | int uv_resident_set_memory(size_t* rss) { |
1644 | 0 | char buf[1024]; |
1645 | 0 | const char* s; |
1646 | 0 | long val; |
1647 | 0 | int rc; |
1648 | 0 | int i; |
1649 | | |
1650 | | /* rss: 24th element */ |
1651 | 0 | rc = uv__slurp("/proc/self/stat", buf, sizeof(buf)); |
1652 | 0 | if (rc < 0) |
1653 | 0 | return rc; |
1654 | | |
1655 | | /* find the last ')' */ |
1656 | 0 | s = strrchr(buf, ')'); |
1657 | 0 | if (s == NULL) |
1658 | 0 | goto err; |
1659 | | |
1660 | 0 | for (i = 1; i <= 22; i++) { |
1661 | 0 | s = strchr(s + 1, ' '); |
1662 | 0 | if (s == NULL) |
1663 | 0 | goto err; |
1664 | 0 | } |
1665 | | |
1666 | 0 | errno = 0; |
1667 | 0 | val = strtol(s, NULL, 10); |
1668 | 0 | if (val < 0 || errno != 0) |
1669 | 0 | goto err; |
1670 | | |
1671 | 0 | *rss = val * getpagesize(); |
1672 | 0 | return 0; |
1673 | | |
1674 | 0 | err: |
1675 | 0 | return UV_EINVAL; |
1676 | 0 | } |
1677 | | |
1678 | 0 | int uv_uptime(double* uptime) { |
1679 | 0 | struct timespec now; |
1680 | 0 | char buf[128]; |
1681 | | |
1682 | | /* Consult /proc/uptime when present (common case), or fall back to |
1683 | | * clock_gettime. Why not always clock_gettime? It doesn't always return the |
1684 | | * right result under OpenVZ and possibly other containerized environments. |
1685 | | */ |
1686 | 0 | if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf))) |
1687 | 0 | if (1 == sscanf(buf, "%lf", uptime)) |
1688 | 0 | return 0; |
1689 | | |
1690 | 0 | if (clock_gettime(CLOCK_BOOTTIME, &now)) |
1691 | 0 | return UV__ERR(errno); |
1692 | | |
1693 | 0 | *uptime = now.tv_sec; |
1694 | 0 | return 0; |
1695 | 0 | } |
1696 | | |
1697 | | |
1698 | 0 | int uv_cpu_info(uv_cpu_info_t** ci, int* count) { |
1699 | | #if defined(__PPC__) |
1700 | | static const char model_marker[] = "cpu\t\t: "; |
1701 | | static const char model_marker2[] = ""; |
1702 | | #elif defined(__arm__) |
1703 | | static const char model_marker[] = "model name\t: "; |
1704 | | static const char model_marker2[] = "Processor\t: "; |
1705 | | #elif defined(__aarch64__) |
1706 | | static const char model_marker[] = "CPU part\t: "; |
1707 | | static const char model_marker2[] = ""; |
1708 | | #elif defined(__mips__) |
1709 | | static const char model_marker[] = "cpu model\t\t: "; |
1710 | | static const char model_marker2[] = ""; |
1711 | | #elif defined(__loongarch__) |
1712 | | static const char model_marker[] = "cpu family\t\t: "; |
1713 | | static const char model_marker2[] = ""; |
1714 | | #else |
1715 | 0 | static const char model_marker[] = "model name\t: "; |
1716 | 0 | static const char model_marker2[] = ""; |
1717 | 0 | #endif |
1718 | 0 | static const char parts[] = |
1719 | | #ifdef __aarch64__ |
1720 | | "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n" |
1721 | | "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n" |
1722 | | "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n" |
1723 | | "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n" |
1724 | | "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n" |
1725 | | "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n" |
1726 | | "0xc0d\nCortex-A17\n" /* Originally A12 */ |
1727 | | "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n" |
1728 | | "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n" |
1729 | | "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n" |
1730 | | "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n" |
1731 | | "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n" |
1732 | | "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n" |
1733 | | "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n" |
1734 | | "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n" |
1735 | | "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n" |
1736 | | "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n" |
1737 | | "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n" "0xd4f\nNeoverse-V2\n" |
1738 | | #endif |
1739 | 0 | ""; |
1740 | 0 | struct cpu { |
1741 | 0 | unsigned long long freq, user, nice, sys, idle, irq; |
1742 | 0 | unsigned model; |
1743 | 0 | }; |
1744 | 0 | FILE* fp; |
1745 | 0 | char* p; |
1746 | 0 | int found; |
1747 | 0 | int n; |
1748 | 0 | unsigned i; |
1749 | 0 | unsigned cpu; |
1750 | 0 | unsigned maxcpu; |
1751 | 0 | unsigned size; |
1752 | 0 | unsigned long long skip; |
1753 | 0 | struct cpu (*cpus)[8192]; /* Kernel maximum. */ |
1754 | 0 | struct cpu* c; |
1755 | 0 | struct cpu t; |
1756 | 0 | char (*model)[64]; |
1757 | 0 | unsigned char bitmap[ARRAY_SIZE(*cpus) / 8]; |
1758 | | /* Assumption: even big.LITTLE systems will have only a handful |
1759 | | * of different CPU models. Most systems will just have one. |
1760 | | */ |
1761 | 0 | char models[8][64]; |
1762 | 0 | char buf[1024]; |
1763 | |
|
1764 | 0 | memset(bitmap, 0, sizeof(bitmap)); |
1765 | 0 | memset(models, 0, sizeof(models)); |
1766 | 0 | snprintf(*models, sizeof(*models), "unknown"); |
1767 | 0 | maxcpu = 0; |
1768 | |
|
1769 | 0 | cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus)); |
1770 | 0 | if (cpus == NULL) |
1771 | 0 | return UV_ENOMEM; |
1772 | | |
1773 | 0 | fp = uv__open_file("/proc/stat"); |
1774 | 0 | if (fp == NULL) { |
1775 | 0 | uv__free(cpus); |
1776 | 0 | return UV__ERR(errno); |
1777 | 0 | } |
1778 | | |
1779 | 0 | if (NULL == fgets(buf, sizeof(buf), fp)) |
1780 | 0 | abort(); |
1781 | | |
1782 | 0 | for (;;) { |
1783 | 0 | memset(&t, 0, sizeof(t)); |
1784 | |
|
1785 | 0 | n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu", |
1786 | 0 | &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq); |
1787 | |
|
1788 | 0 | if (n != 7) |
1789 | 0 | break; |
1790 | | |
1791 | 0 | if (NULL == fgets(buf, sizeof(buf), fp)) |
1792 | 0 | abort(); |
1793 | | |
1794 | 0 | if (cpu >= ARRAY_SIZE(*cpus)) |
1795 | 0 | continue; |
1796 | | |
1797 | 0 | (*cpus)[cpu] = t; |
1798 | |
|
1799 | 0 | bitmap[cpu >> 3] |= 1 << (cpu & 7); |
1800 | |
|
1801 | 0 | if (cpu >= maxcpu) |
1802 | 0 | maxcpu = cpu + 1; |
1803 | 0 | } |
1804 | | |
1805 | 0 | fclose(fp); |
1806 | |
|
1807 | 0 | fp = uv__open_file("/proc/cpuinfo"); |
1808 | 0 | if (fp == NULL) |
1809 | 0 | goto nocpuinfo; |
1810 | | |
1811 | 0 | for (;;) { |
1812 | 0 | if (1 != fscanf(fp, "processor\t: %u\n", &cpu)) |
1813 | 0 | break; /* Parse error. */ |
1814 | | |
1815 | 0 | while (fgets(buf, sizeof(buf), fp)) { |
1816 | 0 | if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) { |
1817 | 0 | p = buf + sizeof(model_marker) - 1; |
1818 | 0 | goto parts; |
1819 | 0 | } |
1820 | 0 | if (!*model_marker2) |
1821 | 0 | continue; |
1822 | 0 | if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) { |
1823 | 0 | p = buf + sizeof(model_marker2) - 1; |
1824 | 0 | goto parts; |
1825 | 0 | } |
1826 | 0 | } |
1827 | | |
1828 | 0 | goto next; /* Not found. */ |
1829 | | |
1830 | 0 | parts: |
1831 | 0 | n = (int) strcspn(p, "\n"); |
1832 | | |
1833 | | /* arm64: translate CPU part code to model name. */ |
1834 | 0 | if (*parts) { |
1835 | 0 | p = memmem(parts, sizeof(parts) - 1, p, n + 1); |
1836 | 0 | if (p == NULL) |
1837 | 0 | p = "unknown"; |
1838 | 0 | else |
1839 | 0 | p += n + 1; |
1840 | 0 | n = (int) strcspn(p, "\n"); |
1841 | 0 | } |
1842 | |
|
1843 | 0 | found = 0; |
1844 | 0 | for (model = models; !found && model < ARRAY_END(models); model++) |
1845 | 0 | found = !strncmp(p, *model, strlen(*model)); |
1846 | |
|
1847 | 0 | if (!found) |
1848 | 0 | goto next; |
1849 | | |
1850 | 0 | if (**model == '\0') |
1851 | 0 | snprintf(*model, sizeof(*model), "%.*s", n, p); |
1852 | |
|
1853 | 0 | if (cpu < maxcpu) |
1854 | 0 | (*cpus)[cpu].model = model - models; |
1855 | |
|
1856 | 0 | next: |
1857 | 0 | while (fgets(buf, sizeof(buf), fp)) |
1858 | 0 | if (*buf == '\n') |
1859 | 0 | break; |
1860 | 0 | } |
1861 | | |
1862 | 0 | fclose(fp); |
1863 | 0 | fp = NULL; |
1864 | |
|
1865 | 0 | nocpuinfo: |
1866 | |
|
1867 | 0 | n = 0; |
1868 | 0 | for (cpu = 0; cpu < maxcpu; cpu++) { |
1869 | 0 | if (!(bitmap[cpu >> 3] & (1 << (cpu & 7)))) |
1870 | 0 | continue; |
1871 | | |
1872 | 0 | n++; |
1873 | 0 | snprintf(buf, sizeof(buf), |
1874 | 0 | "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu); |
1875 | |
|
1876 | 0 | fp = uv__open_file(buf); |
1877 | 0 | if (fp == NULL) |
1878 | 0 | continue; |
1879 | | |
1880 | 0 | if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq)) |
1881 | 0 | abort(); |
1882 | 0 | fclose(fp); |
1883 | 0 | fp = NULL; |
1884 | 0 | } |
1885 | | |
1886 | 0 | size = n * sizeof(**ci) + sizeof(models); |
1887 | 0 | *ci = uv__malloc(size); |
1888 | 0 | *count = 0; |
1889 | |
|
1890 | 0 | if (*ci == NULL) { |
1891 | 0 | uv__free(cpus); |
1892 | 0 | return UV_ENOMEM; |
1893 | 0 | } |
1894 | | |
1895 | 0 | *count = n; |
1896 | 0 | p = memcpy(*ci + n, models, sizeof(models)); |
1897 | |
|
1898 | 0 | i = 0; |
1899 | 0 | for (cpu = 0; cpu < maxcpu; cpu++) { |
1900 | 0 | if (!(bitmap[cpu >> 3] & (1 << (cpu & 7)))) |
1901 | 0 | continue; |
1902 | | |
1903 | 0 | c = *cpus + cpu; |
1904 | |
|
1905 | 0 | (*ci)[i++] = (uv_cpu_info_t) { |
1906 | 0 | .model = p + c->model * sizeof(*model), |
1907 | 0 | .speed = c->freq / 1000, |
1908 | | /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz, |
1909 | | * therefore the multiplier is always 1000/100 = 10. |
1910 | | */ |
1911 | 0 | .cpu_times = (struct uv_cpu_times_s) { |
1912 | 0 | .user = 10 * c->user, |
1913 | 0 | .nice = 10 * c->nice, |
1914 | 0 | .sys = 10 * c->sys, |
1915 | 0 | .idle = 10 * c->idle, |
1916 | 0 | .irq = 10 * c->irq, |
1917 | 0 | }, |
1918 | 0 | }; |
1919 | 0 | } |
1920 | |
|
1921 | 0 | uv__free(cpus); |
1922 | |
|
1923 | 0 | return 0; |
1924 | 0 | } |
1925 | | |
1926 | | |
/* Decide whether a getifaddrs() entry should be skipped.
 *
 * Returns 1 for entries that are not both up and running, or that carry no
 * address. For the remaining entries the answer depends on the address
 * family: PF_PACKET (raw link-level) entries yield `exclude_type`, all other
 * families yield `!exclude_type`.
 */
static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
  const unsigned int required = IFF_UP | IFF_RUNNING;
  int is_packet;

  /* Both flags must be set for the interface to be of interest. */
  if ((ent->ifa_flags & required) != required)
    return 1;

  if (ent->ifa_addr == NULL)
    return 1;

  /* On Linux getifaddrs also reports the raw underlying devices as
   * PF_PACKET entries; the caller picks which of the two kinds it wants.
   */
  is_packet = (ent->ifa_addr->sa_family == PF_PACKET);

  return is_packet ? exclude_type : !exclude_type;
}
1940 | | |
1941 | | /* TODO(bnoordhuis) share with bsd-ifaddrs.c */ |
1942 | 0 | int uv_interface_addresses(uv_interface_address_t** addresses, int* count) { |
1943 | 0 | uv_interface_address_t* address; |
1944 | 0 | struct sockaddr_ll* sll; |
1945 | 0 | struct ifaddrs* addrs; |
1946 | 0 | struct ifaddrs* ent; |
1947 | 0 | size_t namelen; |
1948 | 0 | char* name; |
1949 | 0 | int i; |
1950 | |
|
1951 | 0 | *count = 0; |
1952 | 0 | *addresses = NULL; |
1953 | |
|
1954 | 0 | if (getifaddrs(&addrs)) |
1955 | 0 | return UV__ERR(errno); |
1956 | | |
1957 | | /* Count the number of interfaces */ |
1958 | 0 | namelen = 0; |
1959 | 0 | for (ent = addrs; ent != NULL; ent = ent->ifa_next) { |
1960 | 0 | if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR)) |
1961 | 0 | continue; |
1962 | | |
1963 | 0 | namelen += strlen(ent->ifa_name) + 1; |
1964 | 0 | (*count)++; |
1965 | 0 | } |
1966 | |
|
1967 | 0 | if (*count == 0) { |
1968 | 0 | freeifaddrs(addrs); |
1969 | 0 | return 0; |
1970 | 0 | } |
1971 | | |
1972 | | /* Make sure the memory is initiallized to zero using calloc() */ |
1973 | 0 | *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen); |
1974 | 0 | if (*addresses == NULL) { |
1975 | 0 | freeifaddrs(addrs); |
1976 | 0 | return UV_ENOMEM; |
1977 | 0 | } |
1978 | | |
1979 | 0 | name = (char*) &(*addresses)[*count]; |
1980 | 0 | address = *addresses; |
1981 | |
|
1982 | 0 | for (ent = addrs; ent != NULL; ent = ent->ifa_next) { |
1983 | 0 | if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR)) |
1984 | 0 | continue; |
1985 | | |
1986 | 0 | namelen = strlen(ent->ifa_name) + 1; |
1987 | 0 | address->name = memcpy(name, ent->ifa_name, namelen); |
1988 | 0 | name += namelen; |
1989 | |
|
1990 | 0 | if (ent->ifa_addr->sa_family == AF_INET6) { |
1991 | 0 | address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr); |
1992 | 0 | } else { |
1993 | 0 | address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr); |
1994 | 0 | } |
1995 | |
|
1996 | 0 | if (ent->ifa_netmask->sa_family == AF_INET6) { |
1997 | 0 | address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask); |
1998 | 0 | } else { |
1999 | 0 | address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask); |
2000 | 0 | } |
2001 | |
|
2002 | 0 | address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK); |
2003 | |
|
2004 | 0 | address++; |
2005 | 0 | } |
2006 | | |
2007 | | /* Fill in physical addresses for each interface */ |
2008 | 0 | for (ent = addrs; ent != NULL; ent = ent->ifa_next) { |
2009 | 0 | if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS)) |
2010 | 0 | continue; |
2011 | | |
2012 | 0 | address = *addresses; |
2013 | |
|
2014 | 0 | for (i = 0; i < (*count); i++) { |
2015 | 0 | size_t namelen = strlen(ent->ifa_name); |
2016 | | /* Alias interface share the same physical address */ |
2017 | 0 | if (strncmp(address->name, ent->ifa_name, namelen) == 0 && |
2018 | 0 | (address->name[namelen] == 0 || address->name[namelen] == ':')) { |
2019 | 0 | sll = (struct sockaddr_ll*)ent->ifa_addr; |
2020 | 0 | memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr)); |
2021 | 0 | } |
2022 | 0 | address++; |
2023 | 0 | } |
2024 | 0 | } |
2025 | |
|
2026 | 0 | freeifaddrs(addrs); |
2027 | |
|
2028 | 0 | return 0; |
2029 | 0 | } |
2030 | | |
2031 | | |
/* Set the calling thread's name via prctl(), when PR_SET_NAME is available.
 * Without PR_SET_NAME this is a no-op.
 */
void uv__set_process_title(const char* title) {
#ifdef PR_SET_NAME
  /* Only copies first 16 characters. */
  (void) prctl(PR_SET_NAME, title);
#endif
}
2037 | | |
2038 | | |
/* Look up one field of /proc/meminfo.
 *
 * `what` is the field label including the trailing colon (for example
 * "MemTotal:"). The number following the label is taken to be in kB and is
 * returned in bytes. Returns 0 when the file cannot be read, the label is
 * absent, or no number follows it.
 */
static uint64_t uv__read_proc_meminfo(const char* what) {
  char contents[4096];  /* Meant to hold all of /proc/meminfo. */
  const char* field;
  uint64_t kilobytes;

  if (uv__slurp("/proc/meminfo", contents, sizeof(contents)) != 0)
    return 0;

  field = strstr(contents, what);
  if (field == NULL)
    return 0;

  /* Stays 0 when sscanf() cannot parse a number. */
  kilobytes = 0;
  sscanf(field + strlen(what), "%" PRIu64 " kB", &kilobytes);

  return kilobytes * 1024;
}
2059 | | |
2060 | | |
/* Report free memory in bytes.
 *
 * Prefers the "MemAvailable:" field of /proc/meminfo; when that yields 0,
 * falls back to sysinfo(). Returns 0 if both sources fail.
 */
uint64_t uv_get_free_memory(void) {
  struct sysinfo si;
  uint64_t avail;

  avail = uv__read_proc_meminfo("MemAvailable:");

  if (avail == 0 && sysinfo(&si) == 0)
    avail = (uint64_t) si.freeram * si.mem_unit;

  return avail;
}
2075 | | |
2076 | | |
/* Report total memory in bytes.
 *
 * Prefers the "MemTotal:" field of /proc/meminfo; when that yields 0, falls
 * back to sysinfo(). Returns 0 if both sources fail.
 */
uint64_t uv_get_total_memory(void) {
  struct sysinfo si;
  uint64_t total;

  total = uv__read_proc_meminfo("MemTotal:");

  if (total == 0 && sysinfo(&si) == 0)
    total = (uint64_t) si.totalram * si.mem_unit;

  return total;
}
2091 | | |
2092 | | |
/* Read an unsigned 64-bit decimal number from `filename`.
 *
 * Returns the parsed value, UINT64_MAX when the file contains exactly
 * "max\n", and 0 when the file cannot be read or holds anything else.
 */
static uint64_t uv__read_uint64(const char* filename) {
  char text[32];  /* Large enough to hold an encoded uint64_t. */
  uint64_t value;

  if (uv__slurp(filename, text, sizeof(text)) != 0)
    return 0;

  value = 0;
  if (sscanf(text, "%" PRIu64, &value) == 1)
    return value;

  if (strcmp(text, "max\n") == 0)
    return UINT64_MAX;

  return value;
}
2105 | | |
2106 | | |
/* Given a buffer with the contents of a cgroup1 /proc/self/cgroup, locate
 * the path of the memory controller entry.
 *
 * Scans line by line for the first line whose first ':' begins ":memory:".
 * On a match, returns a pointer just past ":memory:/" (the leading slash is
 * skipped for easy concatenation of paths) and stores the number of bytes
 * up to the end of that line in `*n`. Returns NULL, leaving `*n` untouched,
 * if no such line exists.
 */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  char* cursor;

  /* `cursor` always sits on the first ':' of the line being examined. */
  for (cursor = strchr(buf, ':');
       cursor != NULL;
       cursor = strchr(cursor, ':')) {
    if (strncmp(cursor, ":memory:", 8) == 0) {
      cursor += strlen(":memory:/");
      *n = (int) strcspn(cursor, "\n");
      return cursor;
    }

    /* Not this line; jump to its end and continue with the next one. */
    cursor = strchr(cursor, '\n');
    if (cursor == NULL)
      break;
  }

  return NULL;
}
2131 | | |
/* Read the cgroup1 soft (`*high`) and hard (`*max`) memory limits.
 *
 * `buf` holds the contents of /proc/self/cgroup. The limits of the cgroup
 * named there are tried first; if either read yields 0, the limits of the
 * global memory controller are used instead. A value equal to cgroup1's
 * "unlimited" marker is reported as UINT64_MAX.
 */
static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  uint64_t unlimited;
  char* group;
  int group_len;
  int have_limits;

  have_limits = 0;

  /* Find out where the controller is mounted. */
  group = uv__cgroup1_find_memory_controller(buf, &group_len);
  if (group != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes",
             group_len, group);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes",
             group_len, group);
    *max = uv__read_uint64(filename);

    /* uv__read_uint64 returns 0 when a read fails, e.g. because the
     * controller wasn't mounted.
     */
    have_limits = (*high != 0 && *max != 0);
  }

  if (!have_limits) {
    /* Fall back to the limits of the global memory controller. */
    *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
    *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
  }

  /* uv__read_uint64 already maps cgroup2's "max" to UINT64_MAX; cgroup1's
   * maximum is LONG_MAX rounded down to a page boundary and has to be
   * detected separately.
   */
  unlimited = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);

  if (*high == unlimited)
    *high = UINT64_MAX;

  if (*max == unlimited)
    *max = UINT64_MAX;
}
2171 | | |
/* Read the cgroup2 memory limits of the cgroup named in `buf`.
 *
 * `buf` holds the contents of /proc/self/cgroup and is expected to start
 * with "0::/". Stores the value of memory.max in `*max` and the value of
 * memory.high in `*high`, each as returned by uv__read_uint64().
 */
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  const char* group;
  int group_len;

  /* The cgroup path is whatever follows the "0::/" prefix on the first
   * line.
   */
  group = buf + strlen("0::/");
  group_len = (int) strcspn(group, "\n");

  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max",
           group_len, group);
  *max = uv__read_uint64(filename);

  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high",
           group_len, group);
  *high = uv__read_uint64(filename);
}
2188 | | |
/* Compute the cgroup memory limit for the contents of /proc/self/cgroup
 * given in `buf`.
 *
 * Returns the smaller of the "high" and "max" limits, or 0 when either of
 * them was read as 0.
 */
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t soft;
  uint64_t hard;

  /* A buffer starting with "0::/" is treated as cgroup2; anything else is
   * handled as cgroup1.
   */
  if (strncmp(buf, "0::/", 4) == 0)
    uv__get_cgroup2_memory_limits(buf, &soft, &hard);
  else
    uv__get_cgroup1_memory_limits(buf, &soft, &hard);

  if (soft == 0 || hard == 0)
    return 0;

  if (soft < hard)
    return soft;

  return hard;
}
2204 | | |
/* Return the cgroup memory limit of the current process in bytes, as
 * computed by uv__get_cgroup_constrained_memory(), or 0 when
 * /proc/self/cgroup cannot be read.
 */
uint64_t uv_get_constrained_memory(void) {
  char cgroup[1024];

  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) != 0)
    return 0;

  return uv__get_cgroup_constrained_memory(cgroup);
}
2213 | | |
2214 | | |
/* Read the current cgroup1 memory usage.
 *
 * `buf` holds the contents of /proc/self/cgroup. The usage of the cgroup
 * named there is returned when it reads as non-zero; otherwise the usage of
 * the global memory controller is returned.
 */
static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t usage;
  char* group;
  int group_len;

  /* Find out where the controller is mounted. */
  group = uv__cgroup1_find_memory_controller(buf, &group_len);

  if (group != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes",
             group_len, group);
    usage = uv__read_uint64(filename);

    /* uv__read_uint64 returns 0 when the read fails, e.g. because the
     * controller wasn't mounted; only a non-zero value is trusted.
     */
    if (usage != 0)
      return usage;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}
2238 | | |
/* Read memory.current of the cgroup2 group named in `buf`.
 *
 * `buf` holds the contents of /proc/self/cgroup and is expected to start
 * with "0::/". Returns whatever uv__read_uint64() yields for the file.
 */
static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  const char* group;
  int group_len;

  /* The cgroup path follows the "0::/" prefix on the first line. */
  group = buf + strlen("0::/");
  group_len = (int) strcspn(group, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", group_len, group);

  return uv__read_uint64(filename);
}
2252 | | |
/* Report how much memory is still available to the process, in bytes.
 *
 * Returns 0 when /proc/self/cgroup cannot be read. When no cgroup limit is
 * in effect (the limit reads as 0 or exceeds total memory) the result is
 * uv_get_free_memory(). Otherwise it is the cgroup limit minus the current
 * cgroup usage, clamped at 0.
 */
uint64_t uv_get_available_memory(void) {
  char cgroup[1024];
  uint64_t limit;
  uint64_t used;

  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) != 0)
    return 0;

  limit = uv__get_cgroup_constrained_memory(cgroup);

  /* Total memory is only queried when there is a limit to compare with. */
  if (limit == 0 || limit > uv_get_total_memory())
    return uv_get_free_memory();

  /* A buffer starting with "0::/" is treated as cgroup2. */
  if (strncmp(cgroup, "0::/", 4) == 0)
    used = uv__get_cgroup2_current_memory(cgroup);
  else
    used = uv__get_cgroup1_current_memory(cgroup);

  /* memory usage can be higher than the limit (for short bursts of time) */
  return limit < used ? 0 : limit - used;
}
2282 | | |
2283 | | |
2284 | | static int uv__get_cgroupv2_constrained_cpu(const char* cgroup, |
2285 | 0 | long long* quota) { |
2286 | 0 | static const char cgroup_mount[] = "/sys/fs/cgroup"; |
2287 | 0 | const char* cgroup_trimmed; |
2288 | 0 | char buf[1024]; |
2289 | 0 | char path[256]; |
2290 | 0 | char full_path[sizeof(path) + sizeof("/cpu.max")]; |
2291 | 0 | char quota_buf[16]; |
2292 | 0 | char* last_slash; |
2293 | 0 | int cgroup_size; |
2294 | 0 | long long limit; |
2295 | 0 | long long min_quota; |
2296 | 0 | long long period; |
2297 | |
|
2298 | 0 | if (strncmp(cgroup, "0::/", 4) != 0) |
2299 | 0 | return UV_EINVAL; |
2300 | | |
2301 | | /* Trim ending \n by replacing it with a 0 */ |
2302 | 0 | cgroup_trimmed = cgroup + sizeof("0::/") - 1; /* Skip the prefix "0::/" */ |
2303 | 0 | cgroup_size = (int)strcspn(cgroup_trimmed, "\n"); /* Find the first \n */ |
2304 | 0 | min_quota = LLONG_MAX; |
2305 | | |
2306 | | /* Construct the path to the cpu.max files */ |
2307 | 0 | snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount, |
2308 | 0 | cgroup_size, cgroup_trimmed); |
2309 | | |
2310 | | /* Read controllers, if not exists, not really a cgroup */ |
2311 | 0 | if (uv__slurp(path, buf, sizeof(buf)) < 0) |
2312 | 0 | return UV_EIO; |
2313 | | |
2314 | 0 | snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size, |
2315 | 0 | cgroup_trimmed); |
2316 | | |
2317 | | /* |
2318 | | * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path. |
2319 | | * At each level, attempt to read the "cpu.max" file, which defines the CPU |
2320 | | * quota and period. |
2321 | | * |
2322 | | * This reflects how Linux applies cgroup limits hierarchically. |
2323 | | * |
2324 | | * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check: |
2325 | | * - /sys/fs/cgroup/foo/bar/baz/cpu.max |
2326 | | * - /sys/fs/cgroup/foo/bar/cpu.max |
2327 | | * - /sys/fs/cgroup/foo/cpu.max |
2328 | | * - /sys/fs/cgroup/cpu.max |
2329 | | */ |
2330 | 0 | while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) { |
2331 | 0 | snprintf(full_path, sizeof(full_path), "%s/cpu.max", path); |
2332 | | |
2333 | | /* Silently ignore and continue if the file does not exist */ |
2334 | 0 | if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0) |
2335 | 0 | goto next; |
2336 | | |
2337 | | /* No limit, move on */ |
2338 | 0 | if (strncmp(quota_buf, "max", 3) == 0) |
2339 | 0 | goto next; |
2340 | | |
2341 | | /* Read cpu.max */ |
2342 | 0 | if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2) |
2343 | 0 | goto next; |
2344 | | |
2345 | | /* Can't divide by 0 */ |
2346 | 0 | if (period == 0) |
2347 | 0 | goto next; |
2348 | | |
2349 | 0 | *quota = limit / period; |
2350 | 0 | if (*quota == 0) |
2351 | 0 | *quota = 1; |
2352 | 0 | if (*quota < min_quota) |
2353 | 0 | min_quota = *quota; |
2354 | |
|
2355 | 0 | next: |
2356 | | /* Move up one level in the cgroup hierarchy by trimming the last path. |
2357 | | * The loop ends once we reach the cgroup root mount point. |
2358 | | */ |
2359 | 0 | last_slash = strrchr(path, '/'); |
2360 | 0 | if (last_slash == NULL || strcmp(path, cgroup_mount) == 0) |
2361 | 0 | break; |
2362 | 0 | *last_slash = '\0'; |
2363 | 0 | } |
2364 | | |
2365 | 0 | return 0; |
2366 | 0 | } |
2367 | | |
/* Locate the cpu controller entry in the contents of a cgroup1
 * /proc/self/cgroup.
 *
 * Searches `cgroup` for the first occurrence of ":cpu,". On a match,
 * returns a pointer to the text immediately following it and stores the
 * number of bytes up to the end of that line in `*cgroup_size`. Returns
 * NULL, leaving `*cgroup_size` untouched, when there is no match.
 *
 * NOTE(review): the returned text starts right after ":cpu,", i.e. it still
 * includes whatever precedes the path on that line (e.g. "cpuacct:/...") --
 * confirm this is what callers expect.
 */
static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  static const char marker[] = ":cpu,";
  char* match;

  match = strstr(cgroup, marker);
  if (match == NULL)
    return NULL;

  /* Step over the marker, then measure up to the newline. */
  match += sizeof(marker) - 1;
  *cgroup_size = (int) strcspn(match, "\n");

  return match;
}
2382 | | |
2383 | | static int uv__get_cgroupv1_constrained_cpu(const char* cgroup, |
2384 | 0 | long long* quota) { |
2385 | 0 | char path[256]; |
2386 | 0 | char buf[1024]; |
2387 | 0 | int cgroup_size; |
2388 | 0 | char* cgroup_cpu; |
2389 | 0 | long long period_length; |
2390 | 0 | long long quota_per_period; |
2391 | |
|
2392 | 0 | cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size); |
2393 | |
|
2394 | 0 | if (cgroup_cpu == NULL) |
2395 | 0 | return UV_EIO; |
2396 | | |
2397 | | /* Construct the path to the cpu.cfs_quota_us file */ |
2398 | 0 | snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us", |
2399 | 0 | cgroup_size, cgroup_cpu); |
2400 | | |
2401 | | /* Read cpu.cfs_quota_us */ |
2402 | 0 | if (uv__slurp(path, buf, sizeof(buf)) < 0) |
2403 | 0 | return UV_EIO; |
2404 | | |
2405 | 0 | if (sscanf(buf, "%lld", "a_per_period) != 1) |
2406 | 0 | return UV_EINVAL; |
2407 | | |
2408 | | /* Construct the path to the cpu.cfs_period_us file */ |
2409 | 0 | snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us", |
2410 | 0 | cgroup_size, cgroup_cpu); |
2411 | | |
2412 | | /* Read cpu.cfs_period_us */ |
2413 | 0 | if (uv__slurp(path, buf, sizeof(buf)) < 0) |
2414 | 0 | return UV_EIO; |
2415 | | |
2416 | 0 | if (sscanf(buf, "%lld", &period_length) != 1) |
2417 | 0 | return UV_EINVAL; |
2418 | | |
2419 | | /* Can't divide by 0 */ |
2420 | 0 | if (period_length == 0) |
2421 | 0 | return UV_EINVAL; |
2422 | | |
2423 | 0 | *quota = quota_per_period / period_length; |
2424 | |
|
2425 | 0 | return 0; |
2426 | 0 | } |
2427 | | |
2428 | 0 | int uv__get_constrained_cpu(long long* quota) { |
2429 | 0 | char cgroup[1024]; |
2430 | | |
2431 | | /* Read the cgroup from /proc/self/cgroup */ |
2432 | 0 | if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0) |
2433 | 0 | return UV_EIO; |
2434 | | |
2435 | | /* Check if the system is using cgroup v2 by examining /proc/self/cgroup |
2436 | | * The entry for cgroup v2 is always in the format "0::$PATH" |
2437 | | * see https://docs.kernel.org/admin-guide/cgroup-v2.html */ |
2438 | 0 | if (strncmp(cgroup, "0::/", 4) == 0) |
2439 | 0 | return uv__get_cgroupv2_constrained_cpu(cgroup, quota); |
2440 | 0 | else |
2441 | 0 | return uv__get_cgroupv1_constrained_cpu(cgroup, quota); |
2442 | 0 | } |
2443 | | |
2444 | | |
/* Fill `avg` with the three load averages.
 *
 * The values are parsed from /proc/loadavg when possible; otherwise they
 * are derived from sysinfo(). If both sources fail, `avg` is not given any
 * further values.
 */
void uv_loadavg(double avg[3]) {
  struct sysinfo si;
  char text[128];  /* Meant to hold all of /proc/loadavg. */
  int i;

  if (uv__slurp("/proc/loadavg", text, sizeof(text)) == 0)
    if (sscanf(text, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]) == 3)
      return;

  if (sysinfo(&si) < 0)
    return;

  /* sysinfo() loads are scaled by 65536. */
  for (i = 0; i < 3; i++)
    avg[i] = (double) si.loads[i] / 65536.0;
}
2460 | | |
2461 | | |
2462 | | static int compare_watchers(const struct watcher_list* a, |
2463 | 0 | const struct watcher_list* b) { |
2464 | 0 | if (a->wd < b->wd) return -1; |
2465 | 0 | if (a->wd > b->wd) return 1; |
2466 | 0 | return 0; |
2467 | 0 | } |
2468 | | |
2469 | | |
2470 | 0 | static int init_inotify(uv_loop_t* loop) { |
2471 | 0 | int err; |
2472 | 0 | int fd; |
2473 | |
|
2474 | 0 | if (loop->inotify_fd != -1) |
2475 | 0 | return 0; |
2476 | | |
2477 | 0 | fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); |
2478 | 0 | if (fd < 0) |
2479 | 0 | return UV__ERR(errno); |
2480 | | |
2481 | 0 | err = uv__io_init_start(loop, &loop->inotify_read_watcher, UV__INOTIFY_READ, |
2482 | 0 | fd, POLLIN); |
2483 | 0 | if (err) { |
2484 | 0 | uv__close(fd); |
2485 | 0 | return err; |
2486 | 0 | } |
2487 | | |
2488 | 0 | loop->inotify_fd = fd; |
2489 | 0 | return 0; |
2490 | 0 | } |
2491 | | |
2492 | | |
/* Re-arm every fs-event handle that was registered in the watcher tree
 * `root`.
 *
 * Works in two phases: first every handle found in the tree is stopped and
 * parked on a temporary list together with a private copy of its path; then
 * each parked handle is started again with that path and its existing
 * callback.
 *
 * Returns 0 when `root` is NULL or all handles were restarted, otherwise the
 * first error returned by uv_fs_event_start().
 */
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;  /* Only its queue head is used. */
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      /* Put the handle back on its watcher list so that
       * uv_fs_event_stop() unlinks it from there.
       */
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      /* Park the stopped handle, carrying the duplicated path with it. */
      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  /* Second phase: restart every parked handle. */
  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    /* Take ownership of the duplicated path before restarting. */
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    /* NOTE(review): on this early return the handles still parked on the
     * stack-local `queue` are neither restarted nor unlinked, and their
     * duplicated paths do not appear to be freed -- confirm this is intended.
     */
    if (err)
      return err;
  }

  return 0;
}
2557 | | |
2558 | | |
2559 | 0 | static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) { |
2560 | 0 | struct watcher_list w; |
2561 | 0 | w.wd = wd; |
2562 | 0 | return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w); |
2563 | 0 | } |
2564 | | |
2565 | | |
2566 | 0 | static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) { |
2567 | | /* if the watcher_list->watchers is being iterated over, we can't free it. */ |
2568 | 0 | if ((!w->iterating) && uv__queue_empty(&w->watchers)) { |
2569 | | /* No watchers left for this path. Clean up. */ |
2570 | 0 | RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w); |
2571 | 0 | inotify_rm_watch(loop->inotify_fd, w->wd); |
2572 | 0 | uv__free(w); |
2573 | 0 | } |
2574 | 0 | } |
2575 | | |
2576 | | |
/* Drain the loop's inotify descriptor and dispatch the events to the
 * callbacks of the affected fs-event handles.
 *
 * Reads until the descriptor would block. `dummy` is not used, and the
 * incoming value of `events` is ignored: the parameter is reused as scratch
 * space for the UV_CHANGE/UV_RENAME mask passed to the callbacks.
 */
void uv__inotify_read(uv_loop_t* loop, uv__io_t* dummy, unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char *p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    /* Retry reads that were interrupted by a signal. */
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    if (size == -1) {
      /* Nothing more to read for now. */
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs. Each one is followed
     * by e->len bytes of name, hence the variable stride.
     */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      /* Attribute and content changes map to UV_CHANGE; any other mask bit
       * maps to UV_RENAME.
       */
      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications.  Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        /* Move the handle back onto the watcher list before invoking its
         * callback.
         */
        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}
2649 | | |
2650 | | |
/* Initialize `handle` as a UV_FS_EVENT handle belonging to `loop`.
 * Always returns 0.
 */
int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}
2655 | | |
2656 | | |
2657 | | int uv_fs_event_start(uv_fs_event_t* handle, |
2658 | | uv_fs_event_cb cb, |
2659 | | const char* path, |
2660 | 0 | unsigned int flags) { |
2661 | 0 | struct watcher_list* w; |
2662 | 0 | uv_loop_t* loop; |
2663 | 0 | size_t len; |
2664 | 0 | int events; |
2665 | 0 | int err; |
2666 | 0 | int wd; |
2667 | |
|
2668 | 0 | if (uv__is_active(handle)) |
2669 | 0 | return UV_EINVAL; |
2670 | | |
2671 | 0 | loop = handle->loop; |
2672 | |
|
2673 | 0 | err = init_inotify(loop); |
2674 | 0 | if (err) |
2675 | 0 | return err; |
2676 | | |
2677 | 0 | events = IN_ATTRIB |
2678 | 0 | | IN_CREATE |
2679 | 0 | | IN_MODIFY |
2680 | 0 | | IN_DELETE |
2681 | 0 | | IN_DELETE_SELF |
2682 | 0 | | IN_MOVE_SELF |
2683 | 0 | | IN_MOVED_FROM |
2684 | 0 | | IN_MOVED_TO; |
2685 | |
|
2686 | 0 | wd = inotify_add_watch(loop->inotify_fd, path, events); |
2687 | 0 | if (wd == -1) |
2688 | 0 | return UV__ERR(errno); |
2689 | | |
2690 | 0 | w = find_watcher(loop, wd); |
2691 | 0 | if (w) |
2692 | 0 | goto no_insert; |
2693 | | |
2694 | 0 | len = strlen(path) + 1; |
2695 | 0 | w = uv__malloc(sizeof(*w) + len); |
2696 | 0 | if (w == NULL) |
2697 | 0 | return UV_ENOMEM; |
2698 | | |
2699 | 0 | w->wd = wd; |
2700 | 0 | w->path = memcpy(w + 1, path, len); |
2701 | 0 | uv__queue_init(&w->watchers); |
2702 | 0 | w->iterating = 0; |
2703 | 0 | RB_INSERT(watcher_root, uv__inotify_watchers(loop), w); |
2704 | |
|
2705 | 0 | no_insert: |
2706 | 0 | uv__handle_start(handle); |
2707 | 0 | uv__queue_insert_tail(&w->watchers, &handle->watchers); |
2708 | 0 | handle->path = w->path; |
2709 | 0 | handle->cb = cb; |
2710 | 0 | handle->wd = wd; |
2711 | |
|
2712 | 0 | return 0; |
2713 | 0 | } |
2714 | | |
2715 | | |
2716 | 0 | int uv_fs_event_stop(uv_fs_event_t* handle) { |
2717 | 0 | struct watcher_list* w; |
2718 | |
|
2719 | 0 | if (!uv__is_active(handle)) |
2720 | 0 | return 0; |
2721 | | |
2722 | 0 | w = find_watcher(handle->loop, handle->wd); |
2723 | 0 | assert(w != NULL); |
2724 | |
|
2725 | 0 | handle->wd = -1; |
2726 | 0 | handle->path = NULL; |
2727 | 0 | uv__handle_stop(handle); |
2728 | 0 | uv__queue_remove(&handle->watchers); |
2729 | |
|
2730 | 0 | maybe_free_watcher_list(w, handle->loop); |
2731 | |
|
2732 | 0 | return 0; |
2733 | 0 | } |
2734 | | |
2735 | | |
2736 | 0 | void uv__fs_event_close(uv_fs_event_t* handle) { |
2737 | 0 | uv_fs_event_stop(handle); |
2738 | 0 | } |