/src/systemd/src/shared/cgroup-setup.c
Line | Count | Source |
1 | | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
2 | | |
3 | | #include <linux/magic.h> |
4 | | #include <unistd.h> |
5 | | |
6 | | #include "cgroup-setup.h" |
7 | | #include "cgroup-util.h" |
8 | | #include "errno-util.h" |
9 | | #include "fd-util.h" |
10 | | #include "fileio.h" |
11 | | #include "format-util.h" |
12 | | #include "fs-util.h" |
13 | | #include "log.h" |
14 | | #include "mkdir.h" |
15 | | #include "parse-util.h" |
16 | | #include "path-util.h" |
17 | | #include "process-util.h" |
18 | | #include "recurse-dir.h" |
19 | | #include "set.h" |
20 | | #include "stat-util.h" |
21 | | #include "stdio-util.h" |
22 | | #include "string-util.h" |
23 | | #include "user-util.h" |
24 | | |
25 | 4.34k | int cg_weight_parse(const char *s, uint64_t *ret) { |
26 | 4.34k | uint64_t u; |
27 | 4.34k | int r; |
28 | | |
29 | 4.34k | assert(s); |
30 | 4.34k | assert(ret); |
31 | | |
32 | 4.34k | if (isempty(s)) { |
33 | 395 | *ret = CGROUP_WEIGHT_INVALID; |
34 | 395 | return 0; |
35 | 395 | } |
36 | | |
37 | 3.94k | r = safe_atou64(s, &u); |
38 | 3.94k | if (r < 0) |
39 | 1.22k | return r; |
40 | | |
41 | 2.72k | if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX) |
42 | 1.15k | return -ERANGE; |
43 | | |
44 | 1.56k | *ret = u; |
45 | 1.56k | return 0; |
46 | 2.72k | } |
47 | | |
48 | 1.87k | int cg_cpu_weight_parse(const char *s, uint64_t *ret) { |
49 | 1.87k | assert(s); |
50 | 1.87k | assert(ret); |
51 | | |
52 | 1.87k | if (streq(s, "idle")) |
53 | 254 | return *ret = CGROUP_WEIGHT_IDLE; |
54 | | |
55 | 1.61k | return cg_weight_parse(s, ret); |
56 | 1.87k | } |
57 | | |
58 | | static int trim_cb( |
59 | | RecurseDirEvent event, |
60 | | const char *path, |
61 | | int dir_fd, |
62 | | int inode_fd, |
63 | | const struct dirent *de, |
64 | | const struct statx *sx, |
65 | 0 | void *userdata) { |
66 | | |
67 | | /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */ |
68 | 0 | if (event == RECURSE_DIR_LEAVE && |
69 | 0 | de->d_type == DT_DIR && |
70 | 0 | unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 && |
71 | 0 | !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY)) |
72 | 0 | log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path); |
73 | |
|
74 | 0 | return RECURSE_DIR_CONTINUE; |
75 | 0 | } |
76 | | |
77 | 0 | int cg_trim(const char *path, bool delete_root) { |
78 | 0 | _cleanup_free_ char *fs = NULL; |
79 | 0 | int r; |
80 | |
|
81 | 0 | r = cg_get_path(path, /* suffix= */ NULL, &fs); |
82 | 0 | if (r < 0) |
83 | 0 | return r; |
84 | | |
85 | 0 | r = recurse_dir_at( |
86 | 0 | AT_FDCWD, |
87 | 0 | fs, |
88 | 0 | /* statx_mask= */ 0, |
89 | 0 | /* n_depth_max= */ UINT_MAX, |
90 | 0 | RECURSE_DIR_ENSURE_TYPE, |
91 | 0 | trim_cb, |
92 | 0 | /* userdata= */ NULL); |
93 | 0 | if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */ |
94 | 0 | r = 0; |
95 | 0 | else if (r < 0) |
96 | 0 | log_debug_errno(r, "Failed to trim subcgroups of '%s': %m", path); |
97 | | |
98 | | /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is |
99 | | * already gone anyway). Also, let's debug log about this failure, except if the error code is an |
100 | | * expected one. */ |
101 | 0 | if (delete_root && !empty_or_root(path) && |
102 | 0 | rmdir(fs) < 0 && errno != ENOENT) { |
103 | 0 | if (!IN_SET(errno, ENOTEMPTY, EBUSY)) |
104 | 0 | log_debug_errno(errno, "Failed to trim cgroup '%s': %m", path); |
105 | 0 | RET_GATHER(r, -errno); |
106 | 0 | } |
107 | |
|
108 | 0 | return r; |
109 | 0 | } |
110 | | |
111 | | /* Create a cgroup in the hierarchy of controller. |
112 | | * Returns 0 if the group already existed, 1 on success, negative otherwise. |
113 | | */ |
114 | 0 | int cg_create(const char *path) { |
115 | 0 | _cleanup_free_ char *fs = NULL; |
116 | 0 | int r; |
117 | |
|
118 | 0 | r = cg_get_path(path, /* suffix= */ NULL, &fs); |
119 | 0 | if (r < 0) |
120 | 0 | return r; |
121 | | |
122 | 0 | r = mkdir_parents(fs, 0755); |
123 | 0 | if (r < 0) |
124 | 0 | return r; |
125 | | |
126 | 0 | r = RET_NERRNO(mkdir(fs, 0755)); |
127 | 0 | if (r == -EEXIST) |
128 | 0 | return 0; |
129 | 0 | if (r < 0) |
130 | 0 | return r; |
131 | | |
132 | 0 | return 1; |
133 | 0 | } |
134 | | |
135 | 0 | int cg_attach(const char *path, pid_t pid) { |
136 | 0 | _cleanup_free_ char *fs = NULL; |
137 | 0 | char c[DECIMAL_STR_MAX(pid_t) + 2]; |
138 | 0 | int r; |
139 | |
|
140 | 0 | assert(path); |
141 | 0 | assert(pid >= 0); |
142 | |
|
143 | 0 | r = cg_get_path(path, "cgroup.procs", &fs); |
144 | 0 | if (r < 0) |
145 | 0 | return r; |
146 | | |
147 | 0 | if (pid == 0) |
148 | 0 | pid = getpid_cached(); |
149 | |
|
150 | 0 | xsprintf(c, PID_FMT "\n", pid); |
151 | |
|
152 | 0 | r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER); |
153 | 0 | if (r == -EOPNOTSUPP && cg_is_threaded(path) > 0) |
154 | | /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. */ |
155 | 0 | return -EUCLEAN; |
156 | 0 | if (r < 0) |
157 | 0 | return r; |
158 | | |
159 | 0 | return 0; |
160 | 0 | } |
161 | | |
162 | 0 | int cg_fd_attach(int fd, pid_t pid) { |
163 | 0 | char c[DECIMAL_STR_MAX(pid_t) + 2]; |
164 | |
|
165 | 0 | assert(fd >= 0); |
166 | 0 | assert(pid >= 0); |
167 | |
|
168 | 0 | if (pid == 0) |
169 | 0 | pid = getpid_cached(); |
170 | |
|
171 | 0 | xsprintf(c, PID_FMT "\n", pid); |
172 | |
|
173 | 0 | return write_string_file_at(fd, "cgroup.procs", c, WRITE_STRING_FILE_DISABLE_BUFFER); |
174 | 0 | } |
175 | | |
/* Creates the cgroup 'path' and then attaches 'pid' to it.
 *
 * On success returns the result of cg_create() (1 if the cgroup was newly created, 0 if it already
 * existed). On failure returns the negative error of whichever step failed. Note that the cgroup is
 * not removed again if attaching fails. */
int cg_create_and_attach(const char *path, pid_t pid) {
        int created, r;

        assert(pid >= 0);

        created = cg_create(path);
        if (created < 0)
                return created;

        r = cg_attach(path, pid);

        return r < 0 ? r : created;
}
193 | | |
194 | | int cg_set_access( |
195 | | const char *path, |
196 | | uid_t uid, |
197 | 0 | gid_t gid) { |
198 | |
|
199 | 0 | static const struct { |
200 | 0 | const char *name; |
201 | 0 | bool fatal; |
202 | 0 | } attributes[] = { |
203 | 0 | { "cgroup.procs", true }, |
204 | 0 | { "cgroup.subtree_control", true }, |
205 | 0 | { "cgroup.threads", false }, |
206 | 0 | { "memory.oom.group", false }, |
207 | 0 | { "memory.reclaim", false }, |
208 | 0 | }; |
209 | |
|
210 | 0 | _cleanup_free_ char *fs = NULL; |
211 | 0 | int r; |
212 | |
|
213 | 0 | assert(path); |
214 | |
|
215 | 0 | if (uid == UID_INVALID && gid == GID_INVALID) |
216 | 0 | return 0; |
217 | | |
218 | | /* Configure access to the cgroup itself */ |
219 | 0 | r = cg_get_path(path, /* suffix= */ NULL, &fs); |
220 | 0 | if (r < 0) |
221 | 0 | return r; |
222 | | |
223 | 0 | r = chmod_and_chown(fs, 0755, uid, gid); |
224 | 0 | if (r < 0) |
225 | 0 | return r; |
226 | | |
227 | | /* Configure access to the cgroup's attributes */ |
228 | 0 | FOREACH_ELEMENT(i, attributes) { |
229 | 0 | _cleanup_free_ char *a = path_join(fs, i->name); |
230 | 0 | if (!a) |
231 | 0 | return -ENOMEM; |
232 | | |
233 | 0 | r = chmod_and_chown(a, 0644, uid, gid); |
234 | 0 | if (r < 0) { |
235 | 0 | if (i->fatal) |
236 | 0 | return r; |
237 | | |
238 | 0 | log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", a); |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | 0 | return 0; |
243 | 0 | } |
244 | | |
/* State handed to access_callback() through the recurse_dir userdata pointer. */
struct access_callback_data {
        uid_t uid;      /* owner to pass to fchownat() */
        gid_t gid;      /* group to pass to fchownat() */
        int error;      /* accumulated via RET_GATHER(): 0, or a negative errno-style error */
};
250 | | |
251 | | static int access_callback( |
252 | | RecurseDirEvent event, |
253 | | const char *path, |
254 | | int dir_fd, |
255 | | int inode_fd, |
256 | | const struct dirent *de, |
257 | | const struct statx *sx, |
258 | 0 | void *userdata) { |
259 | |
|
260 | 0 | if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY)) |
261 | 0 | return RECURSE_DIR_CONTINUE; |
262 | | |
263 | 0 | struct access_callback_data *d = ASSERT_PTR(userdata); |
264 | |
|
265 | 0 | assert(path); |
266 | 0 | assert(inode_fd >= 0); |
267 | |
|
268 | 0 | if (fchownat(inode_fd, "", d->uid, d->gid, AT_EMPTY_PATH) < 0) |
269 | 0 | RET_GATHER(d->error, log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", path)); |
270 | |
|
271 | 0 | return RECURSE_DIR_CONTINUE; |
272 | 0 | } |
273 | | |
274 | | int cg_set_access_recursive( |
275 | | const char *path, |
276 | | uid_t uid, |
277 | 0 | gid_t gid) { |
278 | |
|
279 | 0 | _cleanup_close_ int fd = -EBADF; |
280 | 0 | _cleanup_free_ char *fs = NULL; |
281 | 0 | int r; |
282 | |
|
283 | 0 | assert(path); |
284 | | |
285 | | /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files, |
286 | | * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to |
287 | | * delegate, and cg_set_access_recursive() for any subcgroups you might want to create below it. */ |
288 | |
|
289 | 0 | if (!uid_is_valid(uid) && !gid_is_valid(gid)) |
290 | 0 | return 0; |
291 | | |
292 | 0 | r = cg_get_path(path, /* suffix= */ NULL, &fs); |
293 | 0 | if (r < 0) |
294 | 0 | return r; |
295 | | |
296 | 0 | fd = open(fs, O_DIRECTORY|O_CLOEXEC); |
297 | 0 | if (fd < 0) |
298 | 0 | return -errno; |
299 | | |
300 | 0 | struct access_callback_data d = { |
301 | 0 | .uid = uid, |
302 | 0 | .gid = gid, |
303 | 0 | }; |
304 | |
|
305 | 0 | r = recurse_dir(fd, |
306 | 0 | fs, |
307 | 0 | /* statx_mask= */ 0, |
308 | 0 | /* n_depth_max= */ UINT_MAX, |
309 | 0 | RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL, |
310 | 0 | access_callback, |
311 | 0 | &d); |
312 | 0 | if (r < 0) |
313 | 0 | return r; |
314 | | |
315 | 0 | assert(d.error <= 0); |
316 | 0 | return d.error; |
317 | 0 | } |
318 | | |
319 | | int cg_migrate( |
320 | | const char *from, |
321 | | const char *to, |
322 | 0 | CGroupFlags flags) { |
323 | |
|
324 | 0 | _cleanup_set_free_ Set *s = NULL; |
325 | 0 | bool done; |
326 | 0 | int r, ret = 0; |
327 | |
|
328 | 0 | assert(from); |
329 | 0 | assert(to); |
330 | |
|
331 | 0 | do { |
332 | 0 | _cleanup_fclose_ FILE *f = NULL; |
333 | 0 | pid_t pid; |
334 | |
|
335 | 0 | done = true; |
336 | |
|
337 | 0 | r = cg_enumerate_processes(from, &f); |
338 | 0 | if (r < 0) |
339 | 0 | return RET_GATHER(ret, r); |
340 | | |
341 | 0 | while ((r = cg_read_pid(f, &pid, flags)) > 0) { |
342 | | /* Throw an error if unmappable PIDs are in output, we can't migrate those. */ |
343 | 0 | if (pid == 0) |
344 | 0 | return -EREMOTE; |
345 | | |
346 | | /* This might do weird stuff if we aren't a single-threaded program. However, we |
347 | | * luckily know we are. */ |
348 | 0 | if (FLAGS_SET(flags, CGROUP_IGNORE_SELF) && pid == getpid_cached()) |
349 | 0 | continue; |
350 | | |
351 | 0 | if (set_contains(s, PID_TO_PTR(pid))) |
352 | 0 | continue; |
353 | | |
354 | 0 | if (pid_is_kernel_thread(pid) > 0) |
355 | 0 | continue; |
356 | | |
357 | 0 | r = cg_attach(to, pid); |
358 | 0 | if (r < 0) { |
359 | 0 | if (r != -ESRCH) |
360 | 0 | RET_GATHER(ret, r); |
361 | 0 | } else if (ret == 0) |
362 | 0 | ret = 1; |
363 | |
|
364 | 0 | done = false; |
365 | |
|
366 | 0 | r = set_ensure_put(&s, /* hash_ops= */ NULL, PID_TO_PTR(pid)); |
367 | 0 | if (r < 0) |
368 | 0 | return RET_GATHER(ret, r); |
369 | 0 | } |
370 | 0 | if (r == -ENODEV) |
371 | 0 | continue; |
372 | 0 | if (r < 0) |
373 | 0 | return RET_GATHER(ret, r); |
374 | 0 | } while (!done); |
375 | | |
376 | 0 | return ret; |
377 | 0 | } |
378 | | |
379 | | int cg_enable( |
380 | | CGroupMask supported, |
381 | | CGroupMask mask, |
382 | | const char *p, |
383 | 0 | CGroupMask *ret_result_mask) { |
384 | |
|
385 | 0 | _cleanup_fclose_ FILE *f = NULL; |
386 | 0 | _cleanup_free_ char *fs = NULL; |
387 | 0 | CGroupController c; |
388 | 0 | CGroupMask ret = 0; |
389 | 0 | int r; |
390 | |
|
391 | 0 | assert(p); |
392 | |
|
393 | 0 | if (supported == 0) { |
394 | 0 | if (ret_result_mask) |
395 | 0 | *ret_result_mask = 0; |
396 | 0 | return 0; |
397 | 0 | } |
398 | | |
399 | 0 | r = cg_get_path(p, "cgroup.subtree_control", &fs); |
400 | 0 | if (r < 0) |
401 | 0 | return r; |
402 | | |
403 | 0 | for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { |
404 | 0 | CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); |
405 | 0 | const char *n; |
406 | |
|
407 | 0 | if (!FLAGS_SET(CGROUP_MASK_V2, bit)) |
408 | 0 | continue; |
409 | | |
410 | 0 | if (!FLAGS_SET(supported, bit)) |
411 | 0 | continue; |
412 | | |
413 | 0 | n = cgroup_controller_to_string(c); |
414 | 0 | { |
415 | 0 | char s[1 + strlen(n) + 1]; |
416 | |
|
417 | 0 | s[0] = FLAGS_SET(mask, bit) ? '+' : '-'; |
418 | 0 | strcpy(s + 1, n); |
419 | |
|
420 | 0 | if (!f) { |
421 | 0 | f = fopen(fs, "we"); |
422 | 0 | if (!f) |
423 | 0 | return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); |
424 | 0 | } |
425 | | |
426 | 0 | r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER); |
427 | 0 | if (r < 0) { |
428 | 0 | log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m", |
429 | 0 | FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs); |
430 | 0 | clearerr(f); |
431 | | |
432 | | /* If we can't turn off a controller, leave it on in the reported resulting mask. This |
433 | | * happens for example when we attempt to turn off a controller up in the tree that is |
434 | | * used down in the tree. */ |
435 | 0 | if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY |
436 | | * only here, and not follow the same logic |
437 | | * for other errors such as EINVAL or |
438 | | * EOPNOTSUPP or anything else. That's |
439 | | * because EBUSY indicates that the |
440 | | * controllers is currently enabled and |
441 | | * cannot be disabled because something down |
442 | | * the hierarchy is still using it. Any other |
443 | | * error most likely means something like "I |
444 | | * never heard of this controller" or |
445 | | * similar. In the former case it's hence |
446 | | * safe to assume the controller is still on |
447 | | * after the failed operation, while in the |
448 | | * latter case it's safer to assume the |
449 | | * controller is unknown and hence certainly |
450 | | * not enabled. */ |
451 | 0 | ret |= bit; |
452 | 0 | } else { |
453 | | /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */ |
454 | 0 | if (FLAGS_SET(mask, bit)) |
455 | 0 | ret |= bit; |
456 | 0 | } |
457 | 0 | } |
458 | 0 | } |
459 | | |
460 | | /* Let's return the precise set of controllers now enabled for the cgroup. */ |
461 | 0 | if (ret_result_mask) |
462 | 0 | *ret_result_mask = ret; |
463 | |
|
464 | 0 | return 0; |
465 | 0 | } |
466 | | |
/* Checks if any legacy controller/hierarchy is mounted, judging by the file system type found on
 * /sys/fs/cgroup/.
 *
 * Returns false if the directory does not exist or a cgroup2 or sysfs file system is found there,
 * true if a tmpfs is found there, and a negative errno-style error if statfs() fails otherwise or
 * the file system type is not one of the above (-ENOMEDIUM). */
int cg_has_legacy(void) {
        struct statfs fs;

        if (statfs("/sys/fs/cgroup/", &fs) < 0) {
                if (errno != ENOENT)
                        return log_error_errno(errno, "Failed to statfs /sys/fs/cgroup/: %m");

                return false; /* sysfs not mounted? */
        }

        /* Either the unified hierarchy, or nothing is mounted there yet. */
        if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC) || is_fs_type(&fs, SYSFS_MAGIC))
                return false;

        if (!is_fs_type(&fs, TMPFS_MAGIC))
                return log_error_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
                                       "Unknown filesystem type %llx mounted on /sys/fs/cgroup/.",
                                       (unsigned long long) fs.f_type);

        log_info("Found tmpfs on /sys/fs/cgroup/, assuming legacy hierarchy.");
        return true;
}