/src/crosvm/third_party/minijail/libminijail.c
Line | Count | Source |
1 | | /* Copyright 2012 The ChromiumOS Authors |
2 | | * Use of this source code is governed by a BSD-style license that can be |
3 | | * found in the LICENSE file. |
4 | | */ |
5 | | |
6 | | #define _BSD_SOURCE |
7 | | #define _DEFAULT_SOURCE |
8 | | #define _GNU_SOURCE |
9 | | |
10 | | #include <asm/unistd.h> |
11 | | #include <assert.h> |
12 | | #include <dirent.h> |
13 | | #include <errno.h> |
14 | | #include <fcntl.h> |
15 | | #include <grp.h> |
16 | | #include <linux/capability.h> |
17 | | #include <linux/filter.h> |
18 | | #include <sched.h> |
19 | | #include <signal.h> |
20 | | #include <stddef.h> |
21 | | #include <stdio.h> |
22 | | #include <stdlib.h> |
23 | | #include <string.h> |
24 | | #include <sys/capability.h> |
25 | | #include <sys/mount.h> |
26 | | #include <sys/param.h> |
27 | | #include <sys/prctl.h> |
28 | | #include <sys/resource.h> |
29 | | #include <sys/stat.h> |
30 | | #include <sys/sysmacros.h> |
31 | | #include <sys/types.h> |
32 | | #include <sys/user.h> |
33 | | #include <sys/wait.h> |
34 | | #include <syscall.h> |
35 | | #include <unistd.h> |
36 | | |
37 | | #include "landlock_util.h" |
38 | | #include "libminijail-private.h" |
39 | | #include "libminijail.h" |
40 | | |
41 | | #include "signal_handler.h" |
42 | | #include "syscall_filter.h" |
43 | | #include "syscall_wrapper.h" |
44 | | #include "system.h" |
45 | | #include "util.h" |
46 | | |
47 | | /* Until these are reliably available in linux/prctl.h. */ |
48 | | #ifndef PR_ALT_SYSCALL |
49 | 0 | #define PR_ALT_SYSCALL 0x43724f53 |
50 | | #endif |
51 | | |
52 | | /* New cgroup namespace might not be in linux-headers yet. */ |
53 | | #ifndef CLONE_NEWCGROUP |
54 | | #define CLONE_NEWCGROUP 0x02000000 |
55 | | #endif |
56 | | |
57 | 0 | #define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ |
58 | | |
59 | 0 | #define MAX_RLIMITS 32 /* Currently there are 15 supported by Linux. */ |
60 | | |
61 | 0 | #define MAX_PRESERVED_FDS 128U |
62 | | |
63 | | /* Keyctl commands. */ |
64 | 0 | #define KEYCTL_JOIN_SESSION_KEYRING 1 |
65 | | |
66 | | /* |
67 | | * The userspace equivalent of MNT_USER_SETTABLE_MASK, which is the mask of all |
68 | | * flags that can be modified by MS_REMOUNT. |
69 | | */ |
70 | | #define MS_USER_SETTABLE_MASK \ |
71 | 0 | (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME | \ |
72 | 0 | MS_RELATIME | MS_RDONLY) |
73 | | |
74 | | /* |
75 | | * Required for Android host glibc which is permanently stuck on 2.17. Causes |
76 | | * no harm for newer glibc versions. |
77 | | */ |
78 | | #ifndef MS_NOSYMFOLLOW |
79 | | /* Added locally in kernels 4.x+. */ |
80 | 0 | #define MS_NOSYMFOLLOW 256 |
81 | | #endif |
82 | | |
/* A single resource-limit override applied via setrlimit(2). */
struct minijail_rlimit {
	int type;   /* RLIMIT_* constant. */
	rlim_t cur; /* Soft limit. */
	rlim_t max; /* Hard limit. */
};

/* One requested mount; nodes form a singly-linked list on the jail. */
struct mountpoint {
	char *src;	     /* Mount source (device/path/fs-specific). */
	char *dest;	     /* Target path inside the jail. */
	char *type;	     /* Filesystem type string for mount(2). */
	char *data;	     /* Optional fs-specific data argument. */
	int has_data;	     /* Non-zero when |data| was explicitly given. */
	unsigned long flags; /* MS_* mount flags. */
	struct mountpoint *next;
};

/* Deferred remount: re-apply |remount_mode| on |mount_name| later. */
struct minijail_remount {
	unsigned long remount_mode; /* e.g. MS_PRIVATE / MS_SLAVE. */
	char *mount_name;	    /* Mount point to remount. */
	struct minijail_remount *next;
};

/* User callback invoked at a specific jail-setup event. */
struct hook {
	minijail_hook_t hook; /* Callback function. */
	void *payload;	      /* Opaque pointer forwarded to |hook|. */
	minijail_hook_event_t event; /* When the callback fires. */
	struct hook *next;
};

/* Landlock filesystem rule: access bits granted for one path. */
struct fs_rule {
	char *path;		 /* Path the rule applies to. */
	uint64_t landlock_flags; /* LANDLOCK_ACCESS_FS_* bits to allow. */
	struct fs_rule *next;
};

/* Maps an fd in the parent to the fd number it should get in the child. */
struct preserved_fd {
	int parent_fd;
	int child_fd;
};
122 | | |
/*
 * minijail struct: new fields should either be marshaled/unmarshaled or have a
 * comment explaining why that's unnecessary.
 */
struct minijail {
	/*
	 * WARNING: new bool flags should always be added to this struct,
	 * unless you’re certain they don’t need to remain after marshaling.
	 * If you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 */
	/* One bit per opt-in feature; each is set by a minijail_* setter. */
	struct {
		bool uid : 1;
		bool gid : 1;
		bool inherit_suppl_gids : 1;
		bool set_suppl_gids : 1;
		bool keep_suppl_gids : 1;
		bool use_caps : 1;
		bool capbset_drop : 1;
		bool set_ambient_caps : 1;
		bool vfs : 1;
		bool enter_vfs : 1;
		bool pids : 1;
		bool ipc : 1;
		bool uts : 1;
		bool net : 1;
		bool net_loopback : 1;
		bool enter_net : 1;
		bool ns_cgroups : 1;
		bool userns : 1;
		bool disable_setgroups : 1;
		bool seccomp : 1;
		bool remount_proc_ro : 1;
		bool no_new_privs : 1;
		bool seccomp_filter : 1;
		bool seccomp_filter_tsync : 1;
		bool seccomp_filter_logging : 1;
		bool seccomp_filter_allow_speculation : 1;
		bool chroot : 1;
		bool pivot_root : 1;
		bool mount_dev : 1;
		bool mount_tmp : 1;
		bool do_init : 1;
		bool run_as_init : 1;
		bool pid_file : 1;
		bool cgroups : 1;
		bool alt_syscall : 1;
		bool reset_signal_mask : 1;
		bool reset_signal_handlers : 1;
		bool close_open_fds : 1;
		bool new_session_keyring : 1;
		bool forward_signals : 1;
		bool setsid : 1;
		bool using_minimalistic_mountns : 1;
		bool enable_fs_restrictions : 1;
		bool enable_profile_fs_restrictions : 1;
		bool enable_default_runtime : 1;
		bool enable_new_sessions : 1;
	} flags;
	uid_t uid;     /* Target uid (valid when flags.uid is set). */
	gid_t gid;     /* Target gid (valid when flags.gid is set). */
	gid_t usergid; /* Primary gid looked up from |user|. */
	char *user;    /* User name given to minijail_change_user(). */
	size_t suppl_gid_count;
	gid_t *suppl_gid_list; /* Owned array; see set_supplementary_gids(). */
	uint64_t caps;	   /* Runtime capability mask. */
	uint64_t cap_bset; /* Capability bounding set to keep. */
	pid_t initpid;
	int mountns_fd; /* Mount-namespace fd opened by enter_vfs. */
	int netns_fd;	/* Net-namespace fd opened by enter_net. */
	int fs_rules_fd; /* Landlock ruleset fd, -1 until created. */
	int fs_rules_landlock_abi; /* Cached kernel Landlock ABI, -1 = unknown. */
	char *chrootdir; /* Root dir for chroot(2) or pivot_root(2). */
	char *pid_file_path;
	char *uidmap; /* Newline-separated uid_map contents. */
	char *gidmap; /* Newline-separated gid_map contents. */
	char *hostname;
	char *preload_path;
	/*
	 * Filename that will be executed, unless an ELF fd is used instead.
	 * This field is only used for logs and isn't included in marshaling.
	 */
	char *filename;
	size_t filter_len;
	struct sock_fprog *filter_prog; /* Compiled seccomp-bpf program. */
	char *alt_syscall_table;
	struct mountpoint *mounts_head; /* Mount list, in insertion order. */
	struct mountpoint *mounts_tail;
	size_t mounts_count;
	unsigned long remount_mode; /* Mode for the initial "/" remount. */
	struct minijail_remount *remounts_head;
	struct minijail_remount *remounts_tail;
	size_t tmpfs_size;
	struct fs_rule *fs_rules_head; /* Landlock rules, in insertion order. */
	struct fs_rule *fs_rules_tail;
	size_t fs_rules_count;
	char *cgroups[MAX_CGROUPS];
	size_t cgroup_count;
	struct minijail_rlimit rlimits[MAX_RLIMITS];
	size_t rlimit_count;
	uint64_t securebits_skip_mask;
	struct hook *hooks_head;
	struct hook *hooks_tail;
	struct preserved_fd preserved_fds[MAX_PRESERVED_FDS];
	size_t preserved_fd_count;
	char *seccomp_policy_path;
};
230 | | |
231 | | static void run_hooks_or_die(const struct minijail *j, |
232 | | minijail_hook_event_t event); |
233 | | |
234 | | static bool seccomp_is_logging_allowed(const struct minijail *j) |
235 | 0 | { |
236 | 0 | return seccomp_default_ret_log() || j->flags.seccomp_filter_logging; |
237 | 0 | } |
238 | | |
239 | | static void free_mounts_list(struct minijail *j) |
240 | 0 | { |
241 | 0 | while (j->mounts_head) { |
242 | 0 | struct mountpoint *m = j->mounts_head; |
243 | 0 | j->mounts_head = j->mounts_head->next; |
244 | 0 | free(m->data); |
245 | 0 | free(m->type); |
246 | 0 | free(m->dest); |
247 | 0 | free(m->src); |
248 | 0 | free(m); |
249 | 0 | } |
250 | | // No need to clear mounts_head as we know it's NULL after the loop. |
251 | 0 | j->mounts_tail = NULL; |
252 | 0 | } |
253 | | |
254 | | static void free_remounts_list(struct minijail *j) |
255 | 0 | { |
256 | 0 | while (j->remounts_head) { |
257 | 0 | struct minijail_remount *m = j->remounts_head; |
258 | 0 | j->remounts_head = j->remounts_head->next; |
259 | 0 | free(m->mount_name); |
260 | 0 | free(m); |
261 | 0 | } |
262 | | // No need to clear remounts_head as we know it's NULL after the loop. |
263 | 0 | j->remounts_tail = NULL; |
264 | 0 | } |
265 | | |
266 | | static void free_fs_rules_list(struct minijail *j) |
267 | 0 | { |
268 | 0 | while (j->fs_rules_head) { |
269 | 0 | struct fs_rule *r = j->fs_rules_head; |
270 | 0 | j->fs_rules_head = j->fs_rules_head->next; |
271 | 0 | free(r->path); |
272 | 0 | free(r); |
273 | 0 | } |
274 | 0 | j->fs_rules_tail = NULL; |
275 | 0 | } |
276 | | |
/*
 * Writes exactly n bytes from buf to file descriptor fd, retrying on
 * short writes and EINTR.
 * Returns 0 on success or a negative error code on error.
 */
static int write_exactly(int fd, const void *buf, size_t n)
{
	const char *cursor = buf;
	size_t remaining = n;

	while (remaining > 0) {
		const ssize_t ret = write(fd, cursor, remaining);
		if (ret < 0) {
			/* Transparently retry interrupted writes. */
			if (errno == EINTR)
				continue;
			return -errno;
		}
		cursor += ret;
		remaining -= ret;
	}

	return 0;
}
299 | | |
/*
 * Reads exactly n bytes from file descriptor fd into buf, retrying on
 * short reads and EINTR. Premature EOF is reported as -EPIPE.
 * Returns 0 on success or a negative error code on error.
 */
static int read_exactly(int fd, void *buf, size_t n)
{
	char *cursor = buf;
	size_t remaining = n;

	while (remaining > 0) {
		const ssize_t ret = read(fd, cursor, remaining);
		if (ret < 0) {
			/* Transparently retry interrupted reads. */
			if (errno == EINTR)
				continue;
			return -errno;
		}
		if (ret == 0) {
			/* EOF before all requested bytes arrived. */
			errno = EPIPE;
			return -EPIPE;
		}
		cursor += ret;
		remaining -= ret;
	}

	return 0;
}
326 | | |
/* Closes *pfd (if it holds a valid descriptor) and sets it to -1. */
static void close_and_reset(int *pfd)
{
	const int fd = *pfd;
	*pfd = -1;
	if (fd != -1)
		close(fd);
}
334 | | |
/*
 * Strip out flags meant for the parent.
 * We keep things that are not inherited across execve(2) (e.g. capabilities),
 * or are easier to set after execve(2) (e.g. seccomp filters).
 */
void minijail_preenter(struct minijail *j)
{
	/* Namespace-creation and mount-related work stays with the parent. */
	j->flags.vfs = 0;
	j->flags.enter_vfs = 0;
	j->flags.ns_cgroups = 0;
	j->flags.net = 0;
	j->flags.net_loopback = 0;
	j->flags.uts = 0;
	j->flags.remount_proc_ro = 0;
	/* Process-lifecycle management also stays with the parent. */
	j->flags.pids = 0;
	j->flags.do_init = 0;
	j->flags.run_as_init = 0;
	j->flags.pid_file = 0;
	j->flags.cgroups = 0;
	j->flags.forward_signals = 0;
	j->flags.setsid = 0;
	j->remount_mode = 0;
	j->flags.using_minimalistic_mountns = 0;
	j->flags.enable_profile_fs_restrictions = 0;
	j->flags.enable_default_runtime = 0;
	j->flags.enable_new_sessions = 0;
	/*
	 * NOTE(review): the remount list accompanies the mount flags cleared
	 * above, so it is presumably parent-only state as well — it is freed
	 * here rather than carried across.
	 */
	free_remounts_list(j);
}
363 | | |
364 | | static bool fs_refer_restriction_supported(struct minijail *j) |
365 | 0 | { |
366 | 0 | if (j->fs_rules_landlock_abi < 0) { |
367 | 0 | const int abi = landlock_create_ruleset( |
368 | 0 | NULL, 0, LANDLOCK_CREATE_RULESET_VERSION); |
369 | | /* |
370 | | * If we have a valid ABI, save the result. Otherwise, leave |
371 | | * the struct field unmodified to make sure it's correctly |
372 | | * marshaled and unmarshaled. |
373 | | */ |
374 | 0 | if (abi > 0) { |
375 | 0 | j->fs_rules_landlock_abi = abi; |
376 | 0 | } |
377 | 0 | } |
378 | |
|
379 | 0 | return j->fs_rules_landlock_abi >= LANDLOCK_ABI_FS_REFER_SUPPORTED; |
380 | 0 | } |
381 | | |
/*
 * Sets fs_rules_fd to an empty ruleset, if Landlock is available.
 * Returns 0 on success; on failure returns the *positive* errno value left
 * by landlock_create_ruleset() (note: this differs from the negative-errno
 * convention used elsewhere in this file).
 */
static int setup_fs_rules_fd(struct minijail *j)
{
	/* Handle all the access types we know about ... */
	struct minijail_landlock_ruleset_attr ruleset_attr = {
	    .handled_access_fs = HANDLED_ACCESS_TYPES};
	/* ... plus FS_REFER when the kernel's Landlock ABI supports it. */
	if (fs_refer_restriction_supported(j)) {
		ruleset_attr.handled_access_fs |= LANDLOCK_ACCESS_FS_REFER;
	}

	j->fs_rules_fd =
	    landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
	if (j->fs_rules_fd < 0) {
		/*
		 * As of Landlock ABI=3, the useful errors we expect here are
		 * ENOSYS or EOPNOTSUPP. In both cases, Landlock is not
		 * supported by the kernel and Minijail can silently ignore it.
		 * TODO(b/300142205): log when we no longer have 5.4 kernels in
		 * ChromeOS (~EoY 2024).
		 */
		return errno;
	}

	return 0;
}
406 | | |
407 | | /* Adds a rule for a given path to apply once minijail is entered. */ |
408 | | static int add_fs_restriction_path(struct minijail *j, const char *path, |
409 | | uint64_t landlock_flags) |
410 | 0 | { |
411 | 0 | struct fs_rule *r = calloc(1, sizeof(*r)); |
412 | 0 | if (!r) |
413 | 0 | return -ENOMEM; |
414 | 0 | r->path = strdup(path); |
415 | 0 | r->landlock_flags = landlock_flags; |
416 | |
|
417 | 0 | if (j->fs_rules_tail) { |
418 | 0 | j->fs_rules_tail->next = r; |
419 | 0 | j->fs_rules_tail = r; |
420 | 0 | } else { |
421 | 0 | j->fs_rules_head = r; |
422 | 0 | j->fs_rules_tail = r; |
423 | 0 | } |
424 | | |
425 | | /* |
426 | | * If this is our first rule, set up the rules FD early for API users. |
427 | | * |
428 | | * This is important for users calling minijail_enter() directly. |
429 | | * Otherise, this is handled later inside minijail_run_internal(). |
430 | | * |
431 | | * The reason for this is because setup_fs_rules_fd() needs to be |
432 | | * called from inside the process that applies Landlock rules. For |
433 | | * minijail_enter(), that's this process. For minijail_run_internal(), |
434 | | * that's the child process. |
435 | | */ |
436 | 0 | if (j->fs_rules_count == 0) |
437 | 0 | setup_fs_rules_fd(j); |
438 | |
|
439 | 0 | j->fs_rules_count++; |
440 | 0 | return 0; |
441 | 0 | } |
442 | | |
443 | | bool mount_has_bind_flag(struct mountpoint *m) |
444 | 0 | { |
445 | 0 | return !!(m->flags & MS_BIND); |
446 | 0 | } |
447 | | |
448 | | bool mount_has_readonly_flag(struct mountpoint *m) |
449 | 0 | { |
450 | 0 | return !!(m->flags & MS_RDONLY); |
451 | 0 | } |
452 | | |
453 | | bool mount_events_allowed(struct mountpoint *m) |
454 | 0 | { |
455 | 0 | return !!(m->flags & MS_SHARED) || !!(m->flags & MS_SLAVE); |
456 | 0 | } |
457 | | |
458 | | /* |
459 | | * Strip out flags meant for the child. |
460 | | * We keep things that are inherited across execve(2). |
461 | | */ |
462 | | void minijail_preexec(struct minijail *j) |
463 | 0 | { |
464 | 0 | int vfs = j->flags.vfs; |
465 | 0 | int enter_vfs = j->flags.enter_vfs; |
466 | 0 | int ns_cgroups = j->flags.ns_cgroups; |
467 | 0 | int net = j->flags.net; |
468 | 0 | int net_loopback = j->flags.net_loopback; |
469 | 0 | int uts = j->flags.uts; |
470 | 0 | int remount_proc_ro = j->flags.remount_proc_ro; |
471 | 0 | int userns = j->flags.userns; |
472 | 0 | int using_minimalistic_mountns = j->flags.using_minimalistic_mountns; |
473 | 0 | int enable_fs_restrictions = j->flags.enable_fs_restrictions; |
474 | 0 | int enable_profile_fs_restrictions = |
475 | 0 | j->flags.enable_profile_fs_restrictions; |
476 | 0 | int enable_default_runtime = j->flags.enable_default_runtime; |
477 | 0 | int enable_new_sessions = j->flags.enable_new_sessions; |
478 | 0 | if (j->user) |
479 | 0 | free(j->user); |
480 | 0 | j->user = NULL; |
481 | 0 | if (j->suppl_gid_list) |
482 | 0 | free(j->suppl_gid_list); |
483 | 0 | j->suppl_gid_list = NULL; |
484 | 0 | if (j->preload_path) |
485 | 0 | free(j->preload_path); |
486 | 0 | j->preload_path = NULL; |
487 | 0 | free_mounts_list(j); |
488 | 0 | free_fs_rules_list(j); |
489 | 0 | memset(&j->flags, 0, sizeof(j->flags)); |
490 | | /* Now restore anything we meant to keep. */ |
491 | 0 | j->flags.vfs = vfs; |
492 | 0 | j->flags.enter_vfs = enter_vfs; |
493 | 0 | j->flags.ns_cgroups = ns_cgroups; |
494 | 0 | j->flags.net = net; |
495 | 0 | j->flags.net_loopback = net_loopback; |
496 | 0 | j->flags.uts = uts; |
497 | 0 | j->flags.remount_proc_ro = remount_proc_ro; |
498 | 0 | j->flags.userns = userns; |
499 | 0 | j->flags.using_minimalistic_mountns = using_minimalistic_mountns; |
500 | 0 | j->flags.enable_fs_restrictions = enable_fs_restrictions; |
501 | 0 | j->flags.enable_profile_fs_restrictions = |
502 | 0 | enable_profile_fs_restrictions; |
503 | 0 | j->flags.enable_default_runtime = enable_default_runtime; |
504 | 0 | j->flags.enable_new_sessions = enable_new_sessions; |
505 | | /* Note, |pids| will already have been used before this call. */ |
506 | 0 | } |
507 | | |
508 | | /* Minijail API. */ |
509 | | |
510 | | struct minijail API *minijail_new(void) |
511 | 0 | { |
512 | 0 | struct minijail *j = calloc(1, sizeof(struct minijail)); |
513 | 0 | if (j) { |
514 | 0 | j->remount_mode = MS_PRIVATE; |
515 | 0 | j->fs_rules_fd = -1; |
516 | 0 | j->fs_rules_landlock_abi = -1; |
517 | 0 | j->flags.using_minimalistic_mountns = false; |
518 | 0 | j->flags.enable_fs_restrictions = true; |
519 | 0 | j->flags.enable_profile_fs_restrictions = true; |
520 | 0 | j->flags.enable_default_runtime = true; |
521 | 0 | j->flags.enable_new_sessions = true; |
522 | 0 | } |
523 | 0 | return j; |
524 | 0 | } |
525 | | |
526 | | void API minijail_change_uid(struct minijail *j, uid_t uid) |
527 | 0 | { |
528 | 0 | if (uid == 0) |
529 | 0 | die("useless change to uid 0"); |
530 | 0 | j->uid = uid; |
531 | 0 | j->flags.uid = 1; |
532 | 0 | } |
533 | | |
534 | | void API minijail_change_gid(struct minijail *j, gid_t gid) |
535 | 0 | { |
536 | 0 | if (gid == 0) |
537 | 0 | die("useless change to gid 0"); |
538 | 0 | j->gid = gid; |
539 | 0 | j->flags.gid = 1; |
540 | 0 | } |
541 | | |
542 | | void API minijail_set_supplementary_gids(struct minijail *j, size_t size, |
543 | | const gid_t *list) |
544 | 0 | { |
545 | 0 | size_t i; |
546 | |
|
547 | 0 | if (j->flags.inherit_suppl_gids) |
548 | 0 | die("cannot inherit *and* set supplementary groups"); |
549 | 0 | if (j->flags.keep_suppl_gids) |
550 | 0 | die("cannot keep *and* set supplementary groups"); |
551 | |
|
552 | 0 | if (size == 0) { |
553 | | /* Clear supplementary groups. */ |
554 | 0 | j->suppl_gid_list = NULL; |
555 | 0 | j->suppl_gid_count = 0; |
556 | 0 | j->flags.set_suppl_gids = 1; |
557 | 0 | return; |
558 | 0 | } |
559 | | |
560 | | /* Copy the gid_t array. */ |
561 | 0 | j->suppl_gid_list = calloc(size, sizeof(gid_t)); |
562 | 0 | if (!j->suppl_gid_list) { |
563 | 0 | die("failed to allocate internal supplementary group array"); |
564 | 0 | } |
565 | 0 | for (i = 0; i < size; i++) { |
566 | 0 | j->suppl_gid_list[i] = list[i]; |
567 | 0 | } |
568 | 0 | j->suppl_gid_count = size; |
569 | 0 | j->flags.set_suppl_gids = 1; |
570 | 0 | } |
571 | | |
572 | | void API minijail_keep_supplementary_gids(struct minijail *j) |
573 | 0 | { |
574 | 0 | j->flags.keep_suppl_gids = 1; |
575 | 0 | } |
576 | | |
577 | | int API minijail_change_user(struct minijail *j, const char *user) |
578 | 0 | { |
579 | 0 | uid_t uid; |
580 | 0 | gid_t gid; |
581 | 0 | int rc = lookup_user(user, &uid, &gid); |
582 | 0 | if (rc) |
583 | 0 | return rc; |
584 | 0 | minijail_change_uid(j, uid); |
585 | 0 | j->user = strdup(user); |
586 | 0 | if (!j->user) |
587 | 0 | return -ENOMEM; |
588 | 0 | j->usergid = gid; |
589 | 0 | return 0; |
590 | 0 | } |
591 | | |
592 | | int API minijail_change_group(struct minijail *j, const char *group) |
593 | 0 | { |
594 | 0 | gid_t gid; |
595 | 0 | int rc = lookup_group(group, &gid); |
596 | 0 | if (rc) |
597 | 0 | return rc; |
598 | 0 | minijail_change_gid(j, gid); |
599 | 0 | return 0; |
600 | 0 | } |
601 | | |
602 | | void API minijail_use_seccomp(struct minijail *j) |
603 | 0 | { |
604 | 0 | j->flags.seccomp = 1; |
605 | 0 | } |
606 | | |
607 | | void API minijail_no_new_privs(struct minijail *j) |
608 | 0 | { |
609 | 0 | j->flags.no_new_privs = 1; |
610 | 0 | } |
611 | | |
612 | | void API minijail_use_seccomp_filter(struct minijail *j) |
613 | 0 | { |
614 | 0 | j->flags.seccomp_filter = 1; |
615 | 0 | } |
616 | | |
617 | | void API minijail_set_seccomp_filter_tsync(struct minijail *j) |
618 | 0 | { |
619 | 0 | if (j->filter_len > 0 && j->filter_prog != NULL) { |
620 | 0 | die("minijail_set_seccomp_filter_tsync() must be called " |
621 | 0 | "before minijail_parse_seccomp_filters()"); |
622 | 0 | } |
623 | |
|
624 | 0 | if (seccomp_is_logging_allowed(j) && !seccomp_ret_log_available()) { |
625 | | /* |
626 | | * If SECCOMP_RET_LOG is not available, we don't want to use |
627 | | * SECCOMP_RET_TRAP to both kill the entire process and report |
628 | | * failing syscalls, since it will be brittle. Just bail. |
629 | | */ |
630 | 0 | die("SECCOMP_RET_LOG not available, cannot use logging with " |
631 | 0 | "thread sync at the same time"); |
632 | 0 | } |
633 | |
|
634 | 0 | j->flags.seccomp_filter_tsync = 1; |
635 | 0 | } |
636 | | |
637 | | void API minijail_set_seccomp_filter_allow_speculation(struct minijail *j) |
638 | 0 | { |
639 | 0 | if (j->filter_len > 0 && j->filter_prog != NULL) { |
640 | 0 | die("minijail_set_seccomp_filter_allow_speculation() must be " |
641 | 0 | "called before minijail_parse_seccomp_filters()"); |
642 | 0 | } |
643 | |
|
644 | 0 | j->flags.seccomp_filter_allow_speculation = 1; |
645 | 0 | } |
646 | | |
647 | | void API minijail_log_seccomp_filter_failures(struct minijail *j) |
648 | 0 | { |
649 | 0 | if (j->filter_len > 0 && j->filter_prog != NULL) { |
650 | 0 | die("minijail_log_seccomp_filter_failures() must be called " |
651 | 0 | "before minijail_parse_seccomp_filters()"); |
652 | 0 | } |
653 | |
|
654 | 0 | if (j->flags.seccomp_filter_tsync && !seccomp_ret_log_available()) { |
655 | | /* |
656 | | * If SECCOMP_RET_LOG is not available, we don't want to use |
657 | | * SECCOMP_RET_TRAP to both kill the entire process and report |
658 | | * failing syscalls, since it will be brittle. Just bail. |
659 | | */ |
660 | 0 | die("SECCOMP_RET_LOG not available, cannot use thread sync " |
661 | 0 | "with logging at the same time"); |
662 | 0 | } |
663 | |
|
664 | 0 | if (debug_logging_allowed()) { |
665 | 0 | j->flags.seccomp_filter_logging = 1; |
666 | 0 | } else { |
667 | 0 | warn("non-debug build: ignoring request to enable seccomp " |
668 | 0 | "logging"); |
669 | 0 | } |
670 | 0 | } |
671 | | |
672 | | void API minijail_set_using_minimalistic_mountns(struct minijail *j) |
673 | 0 | { |
674 | 0 | j->flags.using_minimalistic_mountns = true; |
675 | 0 | } |
676 | | |
677 | | void API minijail_set_enable_new_sessions(struct minijail *j, |
678 | | bool enable_new_sessions) |
679 | 0 | { |
680 | 0 | j->flags.enable_new_sessions = enable_new_sessions; |
681 | 0 | } |
682 | | |
683 | | void API minijail_set_enable_default_runtime(struct minijail *j, |
684 | | bool enable_default_runtime) |
685 | 0 | { |
686 | 0 | j->flags.enable_default_runtime = enable_default_runtime; |
687 | 0 | } |
688 | | |
689 | | bool API minijail_get_enable_default_runtime(struct minijail *j) |
690 | 0 | { |
691 | 0 | return j->flags.enable_default_runtime; |
692 | 0 | } |
693 | | |
694 | | bool API minijail_is_fs_restriction_available(void) |
695 | 0 | { |
696 | 0 | const int abi = |
697 | 0 | landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION); |
698 | | // ABI > 0 is considered supported. |
699 | 0 | return abi > 0; |
700 | 0 | } |
701 | | |
702 | | void API minijail_disable_fs_restrictions(struct minijail *j) |
703 | 0 | { |
704 | 0 | j->flags.enable_fs_restrictions = false; |
705 | 0 | } |
706 | | |
707 | | void API minijail_set_enable_profile_fs_restrictions(struct minijail *j) |
708 | 0 | { |
709 | 0 | j->flags.enable_profile_fs_restrictions = true; |
710 | 0 | } |
711 | | |
712 | | void API minijail_add_minimalistic_mountns_fs_rules(struct minijail *j) |
713 | 0 | { |
714 | 0 | struct mountpoint *m = j->mounts_head; |
715 | 0 | bool landlock_enabled_by_profile = false; |
716 | 0 | if (!j->flags.using_minimalistic_mountns || |
717 | 0 | !j->flags.enable_profile_fs_restrictions) |
718 | 0 | return; |
719 | | |
720 | | /* Apply Landlock rules. */ |
721 | 0 | while (m) { |
722 | 0 | landlock_enabled_by_profile = true; |
723 | 0 | minijail_add_fs_restriction_rx(j, m->dest); |
724 | | /* |
725 | | * Allow rw if mounted as writable, or mount flags allow mount |
726 | | * events. |
727 | | */ |
728 | 0 | if (!mount_has_readonly_flag(m) || mount_events_allowed(m)) |
729 | 0 | minijail_add_fs_restriction_advanced_rw(j, m->dest); |
730 | 0 | m = m->next; |
731 | 0 | } |
732 | 0 | if (landlock_enabled_by_profile) { |
733 | 0 | minijail_enable_default_fs_restrictions(j); |
734 | 0 | minijail_add_fs_restriction_edit(j, "/dev"); |
735 | 0 | minijail_add_fs_restriction_ro(j, "/proc"); |
736 | 0 | if (j->flags.vfs) |
737 | 0 | minijail_add_fs_restriction_rw(j, "/tmp"); |
738 | 0 | } |
739 | 0 | } |
740 | | |
741 | | void API minijail_enable_default_fs_restrictions(struct minijail *j) |
742 | 0 | { |
743 | | // Common library locations. |
744 | 0 | minijail_add_fs_restriction_rx(j, "/lib"); |
745 | 0 | minijail_add_fs_restriction_rx(j, "/lib64"); |
746 | 0 | minijail_add_fs_restriction_rx(j, "/usr/lib"); |
747 | 0 | minijail_add_fs_restriction_rx(j, "/usr/lib64"); |
748 | | // Common locations for services invoking Minijail. |
749 | 0 | minijail_add_fs_restriction_rx(j, "/bin"); |
750 | 0 | minijail_add_fs_restriction_rx(j, "/sbin"); |
751 | 0 | minijail_add_fs_restriction_rx(j, "/usr/sbin"); |
752 | 0 | minijail_add_fs_restriction_rx(j, "/usr/bin"); |
753 | | // Common /etc locations. |
754 | 0 | minijail_add_fs_restriction_ro(j, "/etc/group"); |
755 | 0 | minijail_add_fs_restriction_ro(j, "/etc/passwd"); |
756 | 0 | } |
757 | | |
758 | | void API minijail_use_caps(struct minijail *j, uint64_t capmask) |
759 | 0 | { |
760 | | /* |
761 | | * 'minijail_use_caps' configures a runtime-capabilities-only |
762 | | * environment, including a bounding set matching the thread's runtime |
763 | | * (permitted|inheritable|effective) sets. |
764 | | * Therefore, it will override any existing bounding set configurations |
765 | | * since the latter would allow gaining extra runtime capabilities from |
766 | | * file capabilities. |
767 | | */ |
768 | 0 | if (j->flags.capbset_drop) { |
769 | 0 | warn("overriding bounding set configuration"); |
770 | 0 | j->cap_bset = 0; |
771 | 0 | j->flags.capbset_drop = 0; |
772 | 0 | } |
773 | 0 | j->caps = capmask; |
774 | 0 | j->flags.use_caps = 1; |
775 | 0 | } |
776 | | |
777 | | void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) |
778 | 0 | { |
779 | 0 | if (j->flags.use_caps) { |
780 | | /* |
781 | | * 'minijail_use_caps' will have already configured a capability |
782 | | * bounding set matching the (permitted|inheritable|effective) |
783 | | * sets. Abort if the user tries to configure a separate |
784 | | * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' |
785 | | * are mutually exclusive. |
786 | | */ |
787 | 0 | die("runtime capabilities already configured, can't drop " |
788 | 0 | "bounding set separately"); |
789 | 0 | } |
790 | 0 | j->cap_bset = capmask; |
791 | 0 | j->flags.capbset_drop = 1; |
792 | 0 | } |
793 | | |
794 | | void API minijail_set_ambient_caps(struct minijail *j) |
795 | 0 | { |
796 | 0 | j->flags.set_ambient_caps = 1; |
797 | 0 | } |
798 | | |
799 | | void API minijail_reset_signal_mask(struct minijail *j) |
800 | 0 | { |
801 | 0 | j->flags.reset_signal_mask = 1; |
802 | 0 | } |
803 | | |
804 | | void API minijail_reset_signal_handlers(struct minijail *j) |
805 | 0 | { |
806 | 0 | j->flags.reset_signal_handlers = 1; |
807 | 0 | } |
808 | | |
809 | | void API minijail_namespace_vfs(struct minijail *j) |
810 | 0 | { |
811 | 0 | j->flags.vfs = 1; |
812 | 0 | } |
813 | | |
814 | | void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) |
815 | 0 | { |
816 | | /* Note: Do not use O_CLOEXEC here. We'll close it after we use it. */ |
817 | 0 | int ns_fd = open(ns_path, O_RDONLY); |
818 | 0 | if (ns_fd < 0) { |
819 | 0 | pdie("failed to open namespace '%s'", ns_path); |
820 | 0 | } |
821 | 0 | j->mountns_fd = ns_fd; |
822 | 0 | j->flags.enter_vfs = 1; |
823 | 0 | } |
824 | | |
825 | | void API minijail_new_session_keyring(struct minijail *j) |
826 | 0 | { |
827 | 0 | j->flags.new_session_keyring = 1; |
828 | 0 | } |
829 | | |
830 | | void API minijail_skip_setting_securebits(struct minijail *j, |
831 | | uint64_t securebits_skip_mask) |
832 | 0 | { |
833 | 0 | j->securebits_skip_mask = securebits_skip_mask; |
834 | 0 | } |
835 | | |
836 | | void API minijail_remount_mode(struct minijail *j, unsigned long mode) |
837 | 0 | { |
838 | 0 | j->remount_mode = mode; |
839 | 0 | } |
840 | | |
841 | | void API minijail_skip_remount_private(struct minijail *j) |
842 | 0 | { |
843 | 0 | j->remount_mode = 0; |
844 | 0 | } |
845 | | |
846 | | void API minijail_namespace_pids(struct minijail *j) |
847 | 0 | { |
848 | 0 | j->flags.vfs = 1; |
849 | 0 | j->flags.remount_proc_ro = 1; |
850 | 0 | j->flags.pids = 1; |
851 | 0 | j->flags.do_init = 1; |
852 | 0 | } |
853 | | |
854 | | void API minijail_namespace_pids_rw_proc(struct minijail *j) |
855 | 0 | { |
856 | 0 | j->flags.vfs = 1; |
857 | 0 | j->flags.pids = 1; |
858 | 0 | j->flags.do_init = 1; |
859 | 0 | } |
860 | | |
861 | | void API minijail_namespace_ipc(struct minijail *j) |
862 | 0 | { |
863 | 0 | j->flags.ipc = 1; |
864 | 0 | } |
865 | | |
866 | | void API minijail_namespace_uts(struct minijail *j) |
867 | 0 | { |
868 | 0 | j->flags.uts = 1; |
869 | 0 | } |
870 | | |
871 | | int API minijail_namespace_set_hostname(struct minijail *j, const char *name) |
872 | 0 | { |
873 | 0 | if (j->hostname) |
874 | 0 | return -EINVAL; |
875 | 0 | minijail_namespace_uts(j); |
876 | 0 | j->hostname = strdup(name); |
877 | 0 | if (!j->hostname) |
878 | 0 | return -ENOMEM; |
879 | 0 | return 0; |
880 | 0 | } |
881 | | |
882 | | void API minijail_namespace_net_loopback(struct minijail *j, |
883 | | bool enable_loopback) |
884 | 0 | { |
885 | 0 | j->flags.net = 1; |
886 | 0 | j->flags.net_loopback = enable_loopback; |
887 | 0 | } |
888 | | |
889 | | void API minijail_namespace_net(struct minijail *j) |
890 | 0 | { |
891 | 0 | minijail_namespace_net_loopback(j, true); |
892 | 0 | } |
893 | | |
894 | | void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) |
895 | 0 | { |
896 | | /* Note: Do not use O_CLOEXEC here. We'll close it after we use it. */ |
897 | 0 | int ns_fd = open(ns_path, O_RDONLY); |
898 | 0 | if (ns_fd < 0) { |
899 | 0 | pdie("failed to open namespace '%s'", ns_path); |
900 | 0 | } |
901 | 0 | j->netns_fd = ns_fd; |
902 | 0 | j->flags.enter_net = 1; |
903 | 0 | } |
904 | | |
905 | | void API minijail_namespace_cgroups(struct minijail *j) |
906 | 0 | { |
907 | 0 | j->flags.ns_cgroups = 1; |
908 | 0 | } |
909 | | |
910 | | void API minijail_close_open_fds(struct minijail *j) |
911 | 0 | { |
912 | 0 | j->flags.close_open_fds = 1; |
913 | 0 | } |
914 | | |
915 | | void API minijail_remount_proc_readonly(struct minijail *j) |
916 | 0 | { |
917 | 0 | j->flags.vfs = 1; |
918 | 0 | j->flags.remount_proc_ro = 1; |
919 | 0 | } |
920 | | |
921 | | void API minijail_namespace_user(struct minijail *j) |
922 | 0 | { |
923 | 0 | j->flags.userns = 1; |
924 | 0 | } |
925 | | |
/* Requests that setgroups(2) be disabled inside the new user namespace. */
void API minijail_namespace_user_disable_setgroups(struct minijail *j)
{
	j->flags.disable_setgroups = 1;
}
930 | | |
931 | | int API minijail_uidmap(struct minijail *j, const char *uidmap) |
932 | 0 | { |
933 | 0 | j->uidmap = strdup(uidmap); |
934 | 0 | if (!j->uidmap) |
935 | 0 | return -ENOMEM; |
936 | 0 | char *ch; |
937 | 0 | for (ch = j->uidmap; *ch; ch++) { |
938 | 0 | if (*ch == ',') |
939 | 0 | *ch = '\n'; |
940 | 0 | } |
941 | 0 | return 0; |
942 | 0 | } |
943 | | |
944 | | int API minijail_gidmap(struct minijail *j, const char *gidmap) |
945 | 0 | { |
946 | 0 | j->gidmap = strdup(gidmap); |
947 | 0 | if (!j->gidmap) |
948 | 0 | return -ENOMEM; |
949 | 0 | char *ch; |
950 | 0 | for (ch = j->gidmap; *ch; ch++) { |
951 | 0 | if (*ch == ',') |
952 | 0 | *ch = '\n'; |
953 | 0 | } |
954 | 0 | return 0; |
955 | 0 | } |
956 | | |
/* Requests that the jailed process keep the user's supplementary groups. */
void API minijail_inherit_usergroups(struct minijail *j)
{
	j->flags.inherit_suppl_gids = 1;
}
961 | | |
/* Runs the jailed program as PID 1 of the new PID namespace. */
void API minijail_run_as_init(struct minijail *j)
{
	/*
	 * Since the jailed program will become 'init' in the new PID namespace,
	 * Minijail does not need to fork an 'init' process.
	 */
	j->flags.run_as_init = 1;
}
970 | | |
971 | | int API minijail_enter_chroot(struct minijail *j, const char *dir) |
972 | 0 | { |
973 | 0 | if (j->chrootdir) |
974 | 0 | return -EINVAL; |
975 | 0 | j->chrootdir = strdup(dir); |
976 | 0 | if (!j->chrootdir) |
977 | 0 | return -ENOMEM; |
978 | 0 | j->flags.chroot = 1; |
979 | 0 | return 0; |
980 | 0 | } |
981 | | |
982 | | int API minijail_enter_pivot_root(struct minijail *j, const char *dir) |
983 | 0 | { |
984 | 0 | if (j->chrootdir) |
985 | 0 | return -EINVAL; |
986 | 0 | j->chrootdir = strdup(dir); |
987 | 0 | if (!j->chrootdir) |
988 | 0 | return -ENOMEM; |
989 | 0 | j->flags.pivot_root = 1; |
990 | 0 | return 0; |
991 | 0 | } |
992 | | |
/*
 * Maps a path as seen inside the jail back to the corresponding path in the
 * parent namespace, by consulting the configured bind mounts and chroot
 * directory. Returns a newly-allocated string the caller must free, or NULL
 * on allocation failure.
 */
char API *minijail_get_original_path(struct minijail *j,
				     const char *path_inside_chroot)
{
	struct mountpoint *b;

	b = j->mounts_head;
	while (b) {
		/*
		 * If |path_inside_chroot| is the exact destination of a
		 * mount, then the original path is exactly the source of
		 * the mount.
		 * for example: "-b /some/path/exe,/chroot/path/exe"
		 * mount source = /some/path/exe, mount dest =
		 * /chroot/path/exe Then when getting the original path of
		 * "/chroot/path/exe", the source of that mount,
		 * "/some/path/exe" is what should be returned.
		 */
		if (streq(b->dest, path_inside_chroot))
			return strdup(b->src);

		/*
		 * If |path_inside_chroot| is within the destination path of a
		 * mount, take the suffix of the chroot path relative to the
		 * mount destination path, and append it to the mount source
		 * path.
		 *
		 * NOTE(review): this is a plain prefix match with no check
		 * that the match ends on a '/' boundary, so a dest of
		 * "/chroot/path" would also match "/chroot/pathological" —
		 * confirm whether callers can hit that case.
		 */
		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
			const char *relative_path =
			    path_inside_chroot + strlen(b->dest);
			return path_join(b->src, relative_path);
		}
		b = b->next;
	}

	/* If there is a chroot path, append |path_inside_chroot| to that. */
	if (j->chrootdir)
		return path_join(j->chrootdir, path_inside_chroot);

	/* No chroot, so the path outside is the same as it is inside. */
	return strdup(path_inside_chroot);
}
1034 | | |
/* Requests a minimal /dev be mounted inside the jail. */
void API minijail_mount_dev(struct minijail *j)
{
	j->flags.mount_dev = 1;
}
1039 | | |
/* Mounts a tmpfs on /tmp inside the jail with a default 64 MiB size. */
void API minijail_mount_tmp(struct minijail *j)
{
	minijail_mount_tmp_size(j, 64 * 1024 * 1024);
}
1044 | | |
/* Mounts a tmpfs of |size| bytes on /tmp inside the jail. */
void API minijail_mount_tmp_size(struct minijail *j, size_t size)
{
	j->tmpfs_size = size;
	j->flags.mount_tmp = 1;
}
1050 | | |
1051 | | int API minijail_write_pid_file(struct minijail *j, const char *path) |
1052 | 0 | { |
1053 | 0 | j->pid_file_path = strdup(path); |
1054 | 0 | if (!j->pid_file_path) |
1055 | 0 | return -ENOMEM; |
1056 | 0 | j->flags.pid_file = 1; |
1057 | 0 | return 0; |
1058 | 0 | } |
1059 | | |
1060 | | int API minijail_add_to_cgroup(struct minijail *j, const char *path) |
1061 | 0 | { |
1062 | 0 | if (j->cgroup_count >= MAX_CGROUPS) |
1063 | 0 | return -ENOMEM; |
1064 | 0 | j->cgroups[j->cgroup_count] = strdup(path); |
1065 | 0 | if (!j->cgroups[j->cgroup_count]) |
1066 | 0 | return -ENOMEM; |
1067 | 0 | j->cgroup_count++; |
1068 | 0 | j->flags.cgroups = 1; |
1069 | 0 | return 0; |
1070 | 0 | } |
1071 | | |
1072 | | int API minijail_rlimit(struct minijail *j, int type, rlim_t cur, rlim_t max) |
1073 | 0 | { |
1074 | 0 | size_t i; |
1075 | |
|
1076 | 0 | if (j->rlimit_count >= MAX_RLIMITS) |
1077 | 0 | return -ENOMEM; |
1078 | | /* It's an error if the caller sets the same rlimit multiple times. */ |
1079 | 0 | for (i = 0; i < j->rlimit_count; i++) { |
1080 | 0 | if (j->rlimits[i].type == type) |
1081 | 0 | return -EEXIST; |
1082 | 0 | } |
1083 | | |
1084 | 0 | j->rlimits[j->rlimit_count].type = type; |
1085 | 0 | j->rlimits[j->rlimit_count].cur = cur; |
1086 | 0 | j->rlimits[j->rlimit_count].max = max; |
1087 | 0 | j->rlimit_count++; |
1088 | 0 | return 0; |
1089 | 0 | } |
1090 | | |
/* Requests that signals received by minijail be forwarded to the child. */
int API minijail_forward_signals(struct minijail *j)
{
	j->flags.forward_signals = 1;
	return 0;
}
1096 | | |
/* Requests that the child run in its own session via setsid(2). */
int API minijail_create_session(struct minijail *j)
{
	j->flags.setsid = 1;
	return 0;
}
1102 | | |
/*
 * Adds a Landlock read+execute rule for |path|.
 * add_fs_restriction_path() appears to return true on success, so the
 * negation yields 0 on success / 1 on failure — TODO confirm against the
 * helper's definition.
 */
int API minijail_add_fs_restriction_rx(struct minijail *j, const char *path)
{
	return !add_fs_restriction_path(j, path,
					ACCESS_FS_ROUGHLY_READ_EXECUTE);
}
1108 | | |
/* Adds a Landlock read-only rule for |path|; 0 on success. */
int API minijail_add_fs_restriction_ro(struct minijail *j, const char *path)
{
	return !add_fs_restriction_path(j, path, ACCESS_FS_ROUGHLY_READ);
}
1113 | | |
/* Adds a Landlock read + basic-write rule for |path|; 0 on success. */
int API minijail_add_fs_restriction_rw(struct minijail *j, const char *path)
{
	return !add_fs_restriction_path(
	    j, path, ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_BASIC_WRITE);
}
1119 | | |
/*
 * Adds a Landlock read + full-write rule for |path|, additionally granting
 * LANDLOCK_ACCESS_FS_REFER (cross-directory link/rename) when the kernel's
 * Landlock ABI supports it; 0 on success.
 */
int API minijail_add_fs_restriction_advanced_rw(struct minijail *j,
						const char *path)
{
	uint16_t landlock_flags =
	    ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_FULL_WRITE;
	if (fs_refer_restriction_supported(j)) {
		landlock_flags |= LANDLOCK_ACCESS_FS_REFER;
	}

	return !add_fs_restriction_path(j, path, landlock_flags);
}
1131 | | |
/* Adds a Landlock read + edit (modify existing files) rule for |path|. */
int API minijail_add_fs_restriction_edit(struct minijail *j, const char *path)
{
	return !add_fs_restriction_path(
	    j, path, ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_EDIT);
}
1137 | | |
/* Adds a Landlock rule for |path| with caller-supplied access flags. */
int API minijail_add_fs_restriction_access_rights(struct minijail *j,
						  const char *path,
						  uint16_t landlock_flags)
{
	return !add_fs_restriction_path(j, path, landlock_flags);
}
1144 | | |
/* Returns true once a Landlock ruleset fd has been created for this jail. */
bool API
minijail_is_fs_restriction_ruleset_initialized(const struct minijail *j)
{
	return j->fs_rules_fd >= 0;
}
1150 | | |
1151 | | static bool is_valid_bind_path(const char *path) |
1152 | 0 | { |
1153 | 0 | if (!block_symlinks_in_bindmount_paths()) { |
1154 | 0 | return true; |
1155 | 0 | } |
1156 | | |
1157 | | /* |
1158 | | * tokenize() will modify both the |prefixes| pointer and the contents |
1159 | | * of the string, so: |
1160 | | * -Copy |BINDMOUNT_ALLOWED_PREFIXES| since it lives in .rodata. |
1161 | | * -Save the original pointer for free()ing. |
1162 | | */ |
1163 | 0 | char *prefixes = strdup(BINDMOUNT_ALLOWED_PREFIXES); |
1164 | 0 | attribute_cleanup_str char *orig_prefixes = prefixes; |
1165 | 0 | (void)orig_prefixes; |
1166 | |
|
1167 | 0 | char *prefix = NULL; |
1168 | 0 | bool found_prefix = false; |
1169 | 0 | if (!is_canonical_path(path)) { |
1170 | 0 | while ((prefix = tokenize(&prefixes, ",")) != NULL) { |
1171 | 0 | if (path_is_parent(prefix, path)) { |
1172 | 0 | found_prefix = true; |
1173 | 0 | break; |
1174 | 0 | } |
1175 | 0 | } |
1176 | 0 | if (!found_prefix) { |
1177 | | /* |
1178 | | * If the path does not include one of the allowed |
1179 | | * prefixes, fail. |
1180 | | */ |
1181 | 0 | warn("path '%s' is not a canonical path", path); |
1182 | 0 | return false; |
1183 | 0 | } |
1184 | 0 | } |
1185 | 0 | return true; |
1186 | 0 | } |
1187 | | |
1188 | | int API minijail_mount_with_data(struct minijail *j, const char *src, |
1189 | | const char *dest, const char *type, |
1190 | | unsigned long flags, const char *data) |
1191 | 0 | { |
1192 | 0 | struct mountpoint *m; |
1193 | |
|
1194 | 0 | if (*dest != '/') |
1195 | 0 | return -EINVAL; |
1196 | 0 | m = calloc(1, sizeof(*m)); |
1197 | 0 | if (!m) |
1198 | 0 | return -ENOMEM; |
1199 | 0 | m->dest = strdup(dest); |
1200 | 0 | if (!m->dest) |
1201 | 0 | goto error; |
1202 | 0 | m->src = strdup(src); |
1203 | 0 | if (!m->src) |
1204 | 0 | goto error; |
1205 | 0 | m->type = strdup(type); |
1206 | 0 | if (!m->type) |
1207 | 0 | goto error; |
1208 | | |
1209 | 0 | if (!data || !data[0]) { |
1210 | | /* |
1211 | | * Set up secure defaults for certain filesystems. Adding this |
1212 | | * fs-specific logic here kind of sucks, but considering how |
1213 | | * people use these in practice, it's probably OK. If they want |
1214 | | * the kernel defaults, they can pass data="" instead of NULL. |
1215 | | */ |
1216 | 0 | if (streq(type, "tmpfs")) { |
1217 | | /* tmpfs defaults to mode=1777 and size=50%. */ |
1218 | 0 | data = "mode=0755,size=10M"; |
1219 | 0 | } |
1220 | 0 | } |
1221 | 0 | if (data) { |
1222 | 0 | m->data = strdup(data); |
1223 | 0 | if (!m->data) |
1224 | 0 | goto error; |
1225 | 0 | m->has_data = 1; |
1226 | 0 | } |
1227 | | |
1228 | | /* If they don't specify any flags, default to secure ones. */ |
1229 | 0 | if (flags == 0) |
1230 | 0 | flags = MS_NODEV | MS_NOEXEC | MS_NOSUID; |
1231 | 0 | m->flags = flags; |
1232 | | |
1233 | | /* |
1234 | | * Unless asked to enter an existing namespace, force vfs namespacing |
1235 | | * so the mounts don't leak out into the containing vfs namespace. |
1236 | | * If Minijail is being asked to enter the root vfs namespace this will |
1237 | | * leak mounts, but it's unlikely that the user would ask to do that by |
1238 | | * mistake. |
1239 | | */ |
1240 | 0 | if (!j->flags.enter_vfs) |
1241 | 0 | minijail_namespace_vfs(j); |
1242 | |
|
1243 | 0 | if (j->mounts_tail) |
1244 | 0 | j->mounts_tail->next = m; |
1245 | 0 | else |
1246 | 0 | j->mounts_head = m; |
1247 | 0 | j->mounts_tail = m; |
1248 | 0 | j->mounts_count++; |
1249 | |
|
1250 | 0 | return 0; |
1251 | | |
1252 | 0 | error: |
1253 | 0 | free(m->type); |
1254 | 0 | free(m->src); |
1255 | 0 | free(m->dest); |
1256 | 0 | free(m); |
1257 | 0 | return -ENOMEM; |
1258 | 0 | } |
1259 | | |
/* Convenience wrapper for minijail_mount_with_data() with no data string. */
int API minijail_mount(struct minijail *j, const char *src, const char *dest,
		       const char *type, unsigned long flags)
{
	return minijail_mount_with_data(j, src, dest, type, flags, NULL);
}
1265 | | |
1266 | | int API minijail_bind(struct minijail *j, const char *src, const char *dest, |
1267 | | int writeable) |
1268 | 0 | { |
1269 | 0 | unsigned long flags = MS_BIND; |
1270 | | |
1271 | | /* |
1272 | | * Check for symlinks in bind-mount source paths to warn the user early. |
1273 | | * Minijail will perform one final check immediately before the mount() |
1274 | | * call. |
1275 | | */ |
1276 | 0 | if (!is_valid_bind_path(src)) { |
1277 | 0 | warn("src '%s' is not a valid bind mount path", src); |
1278 | 0 | return -ELOOP; |
1279 | 0 | } |
1280 | | |
1281 | | /* |
1282 | | * Symlinks in |dest| are blocked by the ChromiumOS LSM: |
1283 | | * <kernel>/security/chromiumos/lsm.c#77 |
1284 | | */ |
1285 | | |
1286 | 0 | if (!writeable) |
1287 | 0 | flags |= MS_RDONLY; |
1288 | | |
1289 | | /* |
1290 | | * |type| is ignored for bind mounts, use it to signal that this mount |
1291 | | * came from minijail_bind(). |
1292 | | * TODO(b/238362528): Implement a better way to signal this. |
1293 | | */ |
1294 | 0 | return minijail_mount(j, src, dest, "minijail_bind", flags); |
1295 | 0 | } |
1296 | | |
1297 | | int API minijail_add_remount(struct minijail *j, const char *mount_name, |
1298 | | unsigned long remount_mode) |
1299 | 0 | { |
1300 | 0 | struct minijail_remount *m; |
1301 | |
|
1302 | 0 | if (*mount_name != '/') |
1303 | 0 | return -EINVAL; |
1304 | 0 | m = calloc(1, sizeof(*m)); |
1305 | 0 | if (!m) |
1306 | 0 | return -ENOMEM; |
1307 | 0 | m->mount_name = strdup(mount_name); |
1308 | 0 | if (!m->mount_name) { |
1309 | 0 | free(m); |
1310 | 0 | return -ENOMEM; |
1311 | 0 | } |
1312 | | |
1313 | 0 | m->remount_mode = remount_mode; |
1314 | |
|
1315 | 0 | if (j->remounts_tail) |
1316 | 0 | j->remounts_tail->next = m; |
1317 | 0 | else |
1318 | 0 | j->remounts_head = m; |
1319 | 0 | j->remounts_tail = m; |
1320 | |
|
1321 | 0 | return 0; |
1322 | 0 | } |
1323 | | |
1324 | | int API minijail_add_hook(struct minijail *j, minijail_hook_t hook, |
1325 | | void *payload, minijail_hook_event_t event) |
1326 | 0 | { |
1327 | 0 | struct hook *c; |
1328 | |
|
1329 | 0 | if (event >= MINIJAIL_HOOK_EVENT_MAX) |
1330 | 0 | return -EINVAL; |
1331 | 0 | c = calloc(1, sizeof(*c)); |
1332 | 0 | if (!c) |
1333 | 0 | return -ENOMEM; |
1334 | | |
1335 | 0 | c->hook = hook; |
1336 | 0 | c->payload = payload; |
1337 | 0 | c->event = event; |
1338 | |
|
1339 | 0 | if (j->hooks_tail) |
1340 | 0 | j->hooks_tail->next = c; |
1341 | 0 | else |
1342 | 0 | j->hooks_head = c; |
1343 | 0 | j->hooks_tail = c; |
1344 | |
|
1345 | 0 | return 0; |
1346 | 0 | } |
1347 | | |
1348 | | int API minijail_preserve_fd(struct minijail *j, int parent_fd, int child_fd) |
1349 | 0 | { |
1350 | 0 | if (parent_fd < 0 || child_fd < 0) |
1351 | 0 | return -EINVAL; |
1352 | 0 | if (j->preserved_fd_count >= MAX_PRESERVED_FDS) |
1353 | 0 | return -ENOMEM; |
1354 | 0 | j->preserved_fds[j->preserved_fd_count].parent_fd = parent_fd; |
1355 | 0 | j->preserved_fds[j->preserved_fd_count].child_fd = child_fd; |
1356 | 0 | j->preserved_fd_count++; |
1357 | 0 | return 0; |
1358 | 0 | } |
1359 | | |
1360 | | int API minijail_set_preload_path(struct minijail *j, const char *preload_path) |
1361 | 0 | { |
1362 | 0 | if (j->preload_path) |
1363 | 0 | return -EINVAL; |
1364 | 0 | j->preload_path = strdup(preload_path); |
1365 | 0 | if (!j->preload_path) |
1366 | 0 | return -ENOMEM; |
1367 | 0 | return 0; |
1368 | 0 | } |
1369 | | |
1370 | | static void clear_seccomp_options(struct minijail *j) |
1371 | 0 | { |
1372 | 0 | j->flags.seccomp_filter = 0; |
1373 | 0 | j->flags.seccomp_filter_tsync = 0; |
1374 | 0 | j->flags.seccomp_filter_logging = 0; |
1375 | 0 | j->flags.seccomp_filter_allow_speculation = 0; |
1376 | 0 | j->filter_len = 0; |
1377 | 0 | j->filter_prog = NULL; |
1378 | 0 | j->flags.no_new_privs = 0; |
1379 | 0 | if (j->seccomp_policy_path) { |
1380 | 0 | free(j->seccomp_policy_path); |
1381 | 0 | } |
1382 | 0 | j->seccomp_policy_path = NULL; |
1383 | 0 | } |
1384 | | |
/*
 * Probes the kernel for seccomp-filter support and downgrades the jail's
 * seccomp options where the platform allows soft-failure.
 * Returns 1 when filters should be loaded, 0 when seccomp was disabled
 * because the kernel lacks support and soft-failing is permitted.
 */
static int seccomp_should_use_filters(struct minijail *j)
{
	/* Probe for basic seccomp-filter support via prctl(2). */
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
		/*
		 * |errno| will be set to EINVAL when seccomp has not been
		 * compiled into the kernel. On certain platforms and kernel
		 * versions this is not a fatal failure. In that case, and only
		 * in that case, disable seccomp and skip loading the filters.
		 */
		if ((errno == EINVAL) && seccomp_can_softfail()) {
			warn("not loading seccomp filters, seccomp filter not "
			     "supported");
			clear_seccomp_options(j);
			return 0;
		}
		/*
		 * If |errno| != EINVAL or seccomp_can_softfail() is false,
		 * we can proceed. Worst case scenario minijail_enter() will
		 * abort() if seccomp fails.
		 */
	}
	if (j->flags.seccomp_filter_tsync) {
		/* Are the seccomp(2) syscall and the TSYNC option supported? */
		if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
				SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
			/* Save errno before later calls can overwrite it. */
			int saved_errno = errno;
			if (saved_errno == ENOSYS && seccomp_can_softfail()) {
				warn("seccomp(2) syscall not supported");
				clear_seccomp_options(j);
				return 0;
			} else if (saved_errno == EINVAL &&
				   seccomp_can_softfail()) {
				warn(
				    "seccomp filter thread sync not supported");
				clear_seccomp_options(j);
				return 0;
			}
			/*
			 * Similar logic here. If seccomp_can_softfail() is
			 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
			 * we can proceed. Worst case scenario minijail_enter()
			 * will abort() if seccomp or TSYNC fail.
			 */
		}
	}
	if (j->flags.seccomp_filter_allow_speculation) {
		/* Is the SPEC_ALLOW flag supported? */
		if (!seccomp_filter_flags_available(
			SECCOMP_FILTER_FLAG_SPEC_ALLOW)) {
			warn("allowing speculative execution on seccomp "
			     "processes not supported");
			j->flags.seccomp_filter_allow_speculation = 0;
		}
	}
	return 1;
}
1441 | | |
1442 | | static int set_seccomp_filters_internal(struct minijail *j, |
1443 | | const struct sock_fprog *filter, |
1444 | | bool owned) |
1445 | 0 | { |
1446 | 0 | struct sock_fprog *fprog; |
1447 | |
|
1448 | 0 | if (owned) { |
1449 | | /* |
1450 | | * If |owned| is true, it's OK to cast away the const-ness since |
1451 | | * we'll own the pointer going forward. |
1452 | | */ |
1453 | 0 | fprog = (struct sock_fprog *)filter; |
1454 | 0 | } else { |
1455 | 0 | fprog = malloc(sizeof(struct sock_fprog)); |
1456 | 0 | if (!fprog) |
1457 | 0 | return -ENOMEM; |
1458 | 0 | fprog->len = filter->len; |
1459 | 0 | fprog->filter = malloc(sizeof(struct sock_filter) * fprog->len); |
1460 | 0 | if (!fprog->filter) { |
1461 | 0 | free(fprog); |
1462 | 0 | return -ENOMEM; |
1463 | 0 | } |
1464 | 0 | memcpy(fprog->filter, filter->filter, |
1465 | 0 | sizeof(struct sock_filter) * fprog->len); |
1466 | 0 | } |
1467 | | |
1468 | 0 | if (j->filter_prog) { |
1469 | 0 | free(j->filter_prog->filter); |
1470 | 0 | free(j->filter_prog); |
1471 | 0 | } |
1472 | |
|
1473 | 0 | j->filter_len = fprog->len; |
1474 | 0 | j->filter_prog = fprog; |
1475 | 0 | return 0; |
1476 | 0 | } |
1477 | | |
/*
 * Compiles the seccomp policy in |policy_file| (named |filename| for
 * diagnostics) into a BPF program and installs it on |j|, taking ownership
 * of the compiled program. Returns 0 on success, -ENOMEM on OOM, or -1 if
 * compilation fails.
 */
static int parse_seccomp_filters(struct minijail *j, const char *filename,
				 FILE *policy_file)
{
	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
	if (!fprog)
		return -ENOMEM;

	struct filter_options filteropts;

	/*
	 * Figure out filter options.
	 * Allow logging?
	 */
	filteropts.allow_logging =
	    debug_logging_allowed() && seccomp_is_logging_allowed(j);

	/* What to do on a blocked system call? */
	if (filteropts.allow_logging) {
		if (seccomp_ret_log_available())
			filteropts.action = ACTION_RET_LOG;
		else
			filteropts.action = ACTION_RET_TRAP;
	} else {
		if (j->flags.seccomp_filter_tsync) {
			if (seccomp_ret_kill_process_available()) {
				filteropts.action = ACTION_RET_KILL_PROCESS;
			} else {
				filteropts.action = ACTION_RET_TRAP;
			}
		} else {
			filteropts.action = ACTION_RET_KILL;
		}
	}

	/*
	 * If SECCOMP_RET_LOG is not available, need to allow extra syscalls
	 * for logging.
	 */
	filteropts.allow_syscalls_for_logging =
	    filteropts.allow_logging && !seccomp_ret_log_available();

	/* Whether to also allow syscalls for libc compatibility. */
	filteropts.include_libc_compatibility_allowlist =
	    allow_libc_compatibility_syscalls();

	/* Whether to fail on duplicate syscalls. */
	filteropts.allow_duplicate_syscalls = allow_duplicate_syscalls();

	if (compile_filter(filename, policy_file, fprog, &filteropts)) {
		free(fprog);
		return -1;
	}

	/* The jail takes ownership of |fprog| on success. */
	return set_seccomp_filters_internal(j, fprog, true /* owned */);
}
1533 | | |
1534 | | void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) |
1535 | 0 | { |
1536 | 0 | if (!seccomp_should_use_filters(j)) |
1537 | 0 | return; |
1538 | | |
1539 | 0 | attribute_cleanup_fp FILE *file = fopen(path, "re"); |
1540 | 0 | if (!file) { |
1541 | 0 | pdie("failed to open seccomp filter file '%s'", path); |
1542 | 0 | } |
1543 | |
|
1544 | 0 | if (parse_seccomp_filters(j, path, file) != 0) { |
1545 | 0 | die("failed to compile seccomp filter BPF program in '%s'", |
1546 | 0 | path); |
1547 | 0 | } |
1548 | 0 | if (j->seccomp_policy_path) { |
1549 | 0 | free(j->seccomp_policy_path); |
1550 | 0 | } |
1551 | 0 | j->seccomp_policy_path = strdup(path); |
1552 | 0 | } |
1553 | | |
/*
 * Like minijail_parse_seccomp_filters(), but reads the policy from an
 * already-open |fd|. The fd's /proc/self/fd path is resolved only to record
 * a human-readable policy path for diagnostics; failure to resolve it is
 * non-fatal. Note the cleanup attribute fclose()s |file|, which also closes
 * the caller's |fd|.
 */
void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
{
	char *fd_path, *path;
	attribute_cleanup_fp FILE *file = NULL;

	if (!seccomp_should_use_filters(j))
		return;

	file = fdopen(fd, "r");
	if (!file) {
		pdie("failed to associate stream with fd %d", fd);
	}

	if (asprintf(&fd_path, "/proc/self/fd/%d", fd) == -1)
		pdie("failed to create path for fd %d", fd);
	path = realpath(fd_path, NULL);
	if (path == NULL)
		pwarn("failed to get path of fd %d", fd);
	free(fd_path);

	if (parse_seccomp_filters(j, path ? path : "<fd>", file) != 0) {
		die("failed to compile seccomp filter BPF program from fd %d",
		    fd);
	}
	if (j->seccomp_policy_path) {
		free(j->seccomp_policy_path);
	}
	/* Ownership of the realpath() allocation transfers to the jail. */
	j->seccomp_policy_path = path;
}
1583 | | |
/*
 * Installs a caller-provided, pre-compiled seccomp BPF program. The program
 * is deep-copied, so the caller retains ownership of |filter|. Dies if
 * logging of filter failures was requested (incompatible) or on OOM.
 */
void API minijail_set_seccomp_filters(struct minijail *j,
				      const struct sock_fprog *filter)
{
	if (!seccomp_should_use_filters(j))
		return;

	if (seccomp_is_logging_allowed(j)) {
		die("minijail_log_seccomp_filter_failures() is incompatible "
		    "with minijail_set_seccomp_filters()");
	}

	/*
	 * set_seccomp_filters_internal() can only fail with ENOMEM.
	 * Furthermore, since we won't own the incoming filter, it will not be
	 * modified.
	 */
	if (set_seccomp_filters_internal(j, filter, false /* owned */) < 0) {
		die("failed to set seccomp filter");
	}
}
1604 | | |
1605 | | int API minijail_use_alt_syscall(struct minijail *j, const char *table) |
1606 | 0 | { |
1607 | 0 | j->alt_syscall_table = strdup(table); |
1608 | 0 | if (!j->alt_syscall_table) |
1609 | 0 | return -ENOMEM; |
1610 | 0 | j->flags.alt_syscall = 1; |
1611 | 0 | return 0; |
1612 | 0 | } |
1613 | | |
/*
 * Incremental serialization state. |total| always tracks the full serialized
 * size even when |buf| is NULL or too small, so a first dry-run pass can be
 * used to size the buffer (see minijail_size()).
 */
struct marshal_state {
	size_t available; /* bytes still writable at |buf| */
	size_t total;	  /* total bytes appended so far */
	char *buf;	  /* current write position (may be NULL) */
};
1619 | | |
1620 | | static void marshal_state_init(struct marshal_state *state, char *buf, |
1621 | | size_t available) |
1622 | 0 | { |
1623 | 0 | state->available = available; |
1624 | 0 | state->buf = buf; |
1625 | 0 | state->total = 0; |
1626 | 0 | } |
1627 | | |
1628 | | static void marshal_append(struct marshal_state *state, const void *src, |
1629 | | size_t length) |
1630 | 0 | { |
1631 | 0 | size_t copy_len = MIN(state->available, length); |
1632 | | |
1633 | | /* Up to |available| will be written. */ |
1634 | 0 | if (copy_len) { |
1635 | 0 | memcpy(state->buf, src, copy_len); |
1636 | 0 | state->buf += copy_len; |
1637 | 0 | state->available -= copy_len; |
1638 | 0 | } |
1639 | | /* |total| will contain the expected length. */ |
1640 | 0 | state->total += length; |
1641 | 0 | } |
1642 | | |
/* Appends |src| including its terminating NUL to the marshal buffer. */
static void marshal_append_string(struct marshal_state *state, const char *src)
{
	marshal_append(state, src, strlen(src) + 1);
}
1647 | | |
1648 | | static void marshal_mount(struct marshal_state *state, |
1649 | | const struct mountpoint *m) |
1650 | 0 | { |
1651 | 0 | marshal_append(state, m->src, strlen(m->src) + 1); |
1652 | 0 | marshal_append(state, m->dest, strlen(m->dest) + 1); |
1653 | 0 | marshal_append(state, m->type, strlen(m->type) + 1); |
1654 | 0 | marshal_append(state, (char *)&m->has_data, sizeof(m->has_data)); |
1655 | 0 | if (m->has_data) |
1656 | 0 | marshal_append(state, m->data, strlen(m->data) + 1); |
1657 | 0 | marshal_append(state, (char *)&m->flags, sizeof(m->flags)); |
1658 | 0 | } |
1659 | | |
1660 | | static void marshal_fs_rule(struct marshal_state *state, |
1661 | | const struct fs_rule *r) |
1662 | 0 | { |
1663 | 0 | marshal_append(state, r->path, strlen(r->path) + 1); |
1664 | 0 | marshal_append(state, (char *)&r->landlock_flags, |
1665 | 0 | sizeof(r->landlock_flags)); |
1666 | 0 | } |
1667 | | |
/*
 * Serializes |j| into |state|. The emission order here is the wire format:
 * it must match the consumption order in minijail_unmarshal() exactly.
 * Run once with a NULL buffer (via minijail_size()) to compute the size.
 */
static void minijail_marshal_helper(struct marshal_state *state,
				    const struct minijail *j)
{
	struct mountpoint *m = NULL;
	struct fs_rule *r = NULL;
	size_t i;

	/* The struct itself goes first; stale pointers inside it act as
	 * presence flags for the variable-length fields that follow. */
	marshal_append(state, (char *)j, sizeof(*j));
	if (j->user)
		marshal_append_string(state, j->user);
	if (j->suppl_gid_list) {
		marshal_append(state, j->suppl_gid_list,
			       j->suppl_gid_count * sizeof(gid_t));
	}
	if (j->chrootdir)
		marshal_append_string(state, j->chrootdir);
	if (j->hostname)
		marshal_append_string(state, j->hostname);
	if (j->alt_syscall_table) {
		marshal_append(state, j->alt_syscall_table,
			       strlen(j->alt_syscall_table) + 1);
	}
	if (j->flags.seccomp_filter && j->filter_prog) {
		struct sock_fprog *fp = j->filter_prog;
		marshal_append(state, (char *)fp->filter,
			       fp->len * sizeof(struct sock_filter));
	}
	for (m = j->mounts_head; m; m = m->next) {
		marshal_mount(state, m);
	}
	for (i = 0; i < j->cgroup_count; ++i)
		marshal_append_string(state, j->cgroups[i]);
	for (r = j->fs_rules_head; r; r = r->next)
		marshal_fs_rule(state, r);
	marshal_append(state, (char *)&j->fs_rules_fd, sizeof(j->fs_rules_fd));
	if (j->seccomp_policy_path)
		marshal_append_string(state, j->seccomp_policy_path);
}
1706 | | |
/*
 * Returns the number of bytes needed to marshal |j|, computed by a dry-run
 * marshal pass with a NULL buffer.
 */
size_t API minijail_size(const struct minijail *j)
{
	struct marshal_state state;
	marshal_state_init(&state, NULL, 0);
	minijail_marshal_helper(&state, j);
	return state.total;
}
1714 | | |
/*
 * Serializes |j| into |buf| (at most |available| bytes).
 * Returns 0 on success, non-zero if |buf| was too small (output truncated).
 */
int minijail_marshal(const struct minijail *j, char *buf, size_t available)
{
	struct marshal_state state;
	marshal_state_init(&state, buf, available);
	minijail_marshal_helper(&state, j);
	return (state.total > available);
}
1722 | | |
1723 | | int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) |
1724 | 0 | { |
1725 | 0 | size_t i; |
1726 | 0 | size_t count; |
1727 | 0 | size_t fs_rules_count; |
1728 | 0 | int ret = -EINVAL; |
1729 | |
|
1730 | 0 | if (length < sizeof(*j)) |
1731 | 0 | goto out; |
1732 | 0 | memcpy((void *)j, serialized, sizeof(*j)); |
1733 | 0 | serialized += sizeof(*j); |
1734 | 0 | length -= sizeof(*j); |
1735 | | |
1736 | | /* Potentially stale pointers not used as signals. */ |
1737 | 0 | j->preload_path = NULL; |
1738 | 0 | j->filename = NULL; |
1739 | 0 | j->pid_file_path = NULL; |
1740 | 0 | j->uidmap = NULL; |
1741 | 0 | j->gidmap = NULL; |
1742 | 0 | j->mounts_head = NULL; |
1743 | 0 | j->mounts_tail = NULL; |
1744 | 0 | j->remounts_head = NULL; |
1745 | 0 | j->remounts_tail = NULL; |
1746 | 0 | j->filter_prog = NULL; |
1747 | 0 | j->hooks_head = NULL; |
1748 | 0 | j->hooks_tail = NULL; |
1749 | 0 | j->fs_rules_head = NULL; |
1750 | 0 | j->fs_rules_tail = NULL; |
1751 | |
|
1752 | 0 | if (j->user) { /* stale pointer */ |
1753 | 0 | char *user = consumestr(&serialized, &length); |
1754 | 0 | if (!user) |
1755 | 0 | goto clear_pointers; |
1756 | 0 | j->user = strdup(user); |
1757 | 0 | if (!j->user) |
1758 | 0 | goto clear_pointers; |
1759 | 0 | } |
1760 | | |
1761 | 0 | if (j->suppl_gid_list) { /* stale pointer */ |
1762 | 0 | if (j->suppl_gid_count > NGROUPS_MAX) { |
1763 | 0 | goto bad_gid_list; |
1764 | 0 | } |
1765 | 0 | size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); |
1766 | 0 | void *gid_list_bytes = |
1767 | 0 | consumebytes(gid_list_size, &serialized, &length); |
1768 | 0 | if (!gid_list_bytes) |
1769 | 0 | goto bad_gid_list; |
1770 | | |
1771 | 0 | j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); |
1772 | 0 | if (!j->suppl_gid_list) |
1773 | 0 | goto bad_gid_list; |
1774 | | |
1775 | 0 | memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); |
1776 | 0 | } |
1777 | | |
1778 | 0 | if (j->chrootdir) { /* stale pointer */ |
1779 | 0 | char *chrootdir = consumestr(&serialized, &length); |
1780 | 0 | if (!chrootdir) |
1781 | 0 | goto bad_chrootdir; |
1782 | 0 | j->chrootdir = strdup(chrootdir); |
1783 | 0 | if (!j->chrootdir) |
1784 | 0 | goto bad_chrootdir; |
1785 | 0 | } |
1786 | | |
1787 | 0 | if (j->hostname) { /* stale pointer */ |
1788 | 0 | char *hostname = consumestr(&serialized, &length); |
1789 | 0 | if (!hostname) |
1790 | 0 | goto bad_hostname; |
1791 | 0 | j->hostname = strdup(hostname); |
1792 | 0 | if (!j->hostname) |
1793 | 0 | goto bad_hostname; |
1794 | 0 | } |
1795 | | |
1796 | 0 | if (j->alt_syscall_table) { /* stale pointer */ |
1797 | 0 | char *alt_syscall_table = consumestr(&serialized, &length); |
1798 | 0 | if (!alt_syscall_table) |
1799 | 0 | goto bad_syscall_table; |
1800 | 0 | j->alt_syscall_table = strdup(alt_syscall_table); |
1801 | 0 | if (!j->alt_syscall_table) |
1802 | 0 | goto bad_syscall_table; |
1803 | 0 | } |
1804 | | |
1805 | 0 | if (j->flags.seccomp_filter && j->filter_len > 0) { |
1806 | 0 | size_t ninstrs = j->filter_len; |
1807 | 0 | if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || |
1808 | 0 | ninstrs > USHRT_MAX) |
1809 | 0 | goto bad_filters; |
1810 | | |
1811 | 0 | size_t program_len = ninstrs * sizeof(struct sock_filter); |
1812 | 0 | void *program = consumebytes(program_len, &serialized, &length); |
1813 | 0 | if (!program) |
1814 | 0 | goto bad_filters; |
1815 | | |
1816 | 0 | j->filter_prog = malloc(sizeof(struct sock_fprog)); |
1817 | 0 | if (!j->filter_prog) |
1818 | 0 | goto bad_filters; |
1819 | | |
1820 | 0 | j->filter_prog->len = ninstrs; |
1821 | 0 | j->filter_prog->filter = malloc(program_len); |
1822 | 0 | if (!j->filter_prog->filter) |
1823 | 0 | goto bad_filter_prog_instrs; |
1824 | | |
1825 | 0 | memcpy(j->filter_prog->filter, program, program_len); |
1826 | 0 | } |
1827 | | |
1828 | 0 | count = j->mounts_count; |
1829 | 0 | j->mounts_count = 0; |
1830 | 0 | for (i = 0; i < count; ++i) { |
1831 | 0 | unsigned long *flags; |
1832 | 0 | int *has_data; |
1833 | 0 | const char *dest; |
1834 | 0 | const char *type; |
1835 | 0 | const char *data = NULL; |
1836 | 0 | const char *src = consumestr(&serialized, &length); |
1837 | 0 | if (!src) |
1838 | 0 | goto bad_mounts; |
1839 | 0 | dest = consumestr(&serialized, &length); |
1840 | 0 | if (!dest) |
1841 | 0 | goto bad_mounts; |
1842 | 0 | type = consumestr(&serialized, &length); |
1843 | 0 | if (!type) |
1844 | 0 | goto bad_mounts; |
1845 | 0 | has_data = |
1846 | 0 | consumebytes(sizeof(*has_data), &serialized, &length); |
1847 | 0 | if (!has_data) |
1848 | 0 | goto bad_mounts; |
1849 | 0 | if (*has_data) { |
1850 | 0 | data = consumestr(&serialized, &length); |
1851 | 0 | if (!data) |
1852 | 0 | goto bad_mounts; |
1853 | 0 | } |
1854 | 0 | flags = consumebytes(sizeof(*flags), &serialized, &length); |
1855 | 0 | if (!flags) |
1856 | 0 | goto bad_mounts; |
1857 | 0 | if (minijail_mount_with_data(j, src, dest, type, *flags, data)) |
1858 | 0 | goto bad_mounts; |
1859 | 0 | } |
1860 | | |
1861 | 0 | count = j->cgroup_count; |
1862 | 0 | j->cgroup_count = 0; |
1863 | 0 | for (i = 0; i < count; ++i) { |
1864 | 0 | char *cgroup = consumestr(&serialized, &length); |
1865 | 0 | if (!cgroup) |
1866 | 0 | goto bad_cgroups; |
1867 | 0 | j->cgroups[i] = strdup(cgroup); |
1868 | 0 | if (!j->cgroups[i]) |
1869 | 0 | goto bad_cgroups; |
1870 | 0 | ++j->cgroup_count; |
1871 | 0 | } |
1872 | | |
1873 | | /* Unmarshal fs_rules. */ |
1874 | 0 | fs_rules_count = j->fs_rules_count; |
1875 | 0 | j->fs_rules_count = 0; |
1876 | 0 | for (i = 0; i < fs_rules_count; ++i) { |
1877 | 0 | const char *path = consumestr(&serialized, &length); |
1878 | 0 | uint64_t landlock_flags; |
1879 | 0 | void *landlock_flags_bytes = |
1880 | 0 | consumebytes(sizeof(landlock_flags), &serialized, &length); |
1881 | |
|
1882 | 0 | if (!path) |
1883 | 0 | goto bad_fs_rules; |
1884 | 0 | memcpy(&landlock_flags, landlock_flags_bytes, |
1885 | 0 | sizeof(landlock_flags)); |
1886 | 0 | if (!landlock_flags) |
1887 | 0 | goto bad_fs_rules; |
1888 | 0 | if (add_fs_restriction_path(j, path, landlock_flags)) |
1889 | 0 | goto bad_fs_rules; |
1890 | 0 | } |
1891 | | /* Unmarshal fs_rules_fd. */ |
1892 | 0 | void *fs_rules_fd_bytes = |
1893 | 0 | consumebytes(sizeof(j->fs_rules_fd), &serialized, &length); |
1894 | 0 | memcpy(&j->fs_rules_fd, fs_rules_fd_bytes, sizeof(j->fs_rules_fd)); |
1895 | 0 | if (!j->fs_rules_fd) |
1896 | 0 | goto bad_cgroups; |
1897 | | |
1898 | 0 | if (j->seccomp_policy_path) { /* stale pointer */ |
1899 | 0 | char *seccomp_policy_path = consumestr(&serialized, &length); |
1900 | 0 | if (!seccomp_policy_path) |
1901 | 0 | goto bad_cgroups; |
1902 | 0 | j->seccomp_policy_path = strdup(seccomp_policy_path); |
1903 | 0 | if (!j->seccomp_policy_path) |
1904 | 0 | goto bad_cgroups; |
1905 | 0 | } |
1906 | | |
1907 | 0 | return 0; |
1908 | | |
1909 | | /* |
1910 | | * If more is added after j->seccomp_policy_path, then this is needed: |
1911 | | * if (j->seccomp_policy_path) |
1912 | | * free(j->seccomp_policy_path); |
1913 | | */ |
1914 | | |
1915 | 0 | bad_cgroups: |
1916 | 0 | free_mounts_list(j); |
1917 | 0 | free_remounts_list(j); |
1918 | 0 | for (i = 0; i < j->cgroup_count; ++i) |
1919 | 0 | free(j->cgroups[i]); |
1920 | 0 | bad_fs_rules: |
1921 | 0 | free_fs_rules_list(j); |
1922 | 0 | bad_mounts: |
1923 | 0 | if (j->filter_prog && j->filter_prog->filter) |
1924 | 0 | free(j->filter_prog->filter); |
1925 | 0 | bad_filter_prog_instrs: |
1926 | 0 | if (j->filter_prog) |
1927 | 0 | free(j->filter_prog); |
1928 | 0 | bad_filters: |
1929 | 0 | if (j->alt_syscall_table) |
1930 | 0 | free(j->alt_syscall_table); |
1931 | 0 | bad_syscall_table: |
1932 | 0 | if (j->hostname) |
1933 | 0 | free(j->hostname); |
1934 | 0 | bad_hostname: |
1935 | 0 | if (j->chrootdir) |
1936 | 0 | free(j->chrootdir); |
1937 | 0 | bad_chrootdir: |
1938 | 0 | if (j->suppl_gid_list) |
1939 | 0 | free(j->suppl_gid_list); |
1940 | 0 | bad_gid_list: |
1941 | 0 | if (j->user) |
1942 | 0 | free(j->user); |
1943 | 0 | clear_pointers: |
1944 | 0 | j->user = NULL; |
1945 | 0 | j->suppl_gid_list = NULL; |
1946 | 0 | j->chrootdir = NULL; |
1947 | 0 | j->hostname = NULL; |
1948 | 0 | j->alt_syscall_table = NULL; |
1949 | 0 | j->cgroup_count = 0; |
1950 | 0 | j->fs_rules_count = 0; |
1951 | 0 | j->seccomp_policy_path = NULL; |
1952 | 0 | out: |
1953 | 0 | return ret; |
1954 | 0 | } |
1955 | | |
/* Describes one static device node created in the staged /dev tmpfs. */
struct dev_spec {
	const char *name;   /* Node name relative to /dev (e.g. "null"). */
	mode_t mode;        /* File type | permission bits (e.g. S_IFCHR | 0666). */
	dev_t major, minor; /* Device numbers, combined via makedev(3). */
};
1961 | | |
// clang-format off
/*
 * The minimal set of character devices created by mount_dev().
 * Note /dev/urandom is created read-only (0444) while the rest are 0666.
 */
static const struct dev_spec device_nodes[] = {
    {
	"null",
	S_IFCHR | 0666, 1, 3,
    },
    {
	"zero",
	S_IFCHR | 0666, 1, 5,
    },
    {
	"full",
	S_IFCHR | 0666, 1, 7,
    },
    {
	"urandom",
	S_IFCHR | 0444, 1, 9,
    },
    {
	"tty",
	S_IFCHR | 0666, 5, 0,
    },
};
// clang-format on
1986 | | |
/* Describes one symlink created in the staged /dev: |source| -> |dest|. */
struct dev_sym_spec {
	const char *source, *dest; /* Link name (under /dev) and its target. */
};
1990 | | |
/*
 * Symlinks created by mount_dev(): the conventional /dev/ptmx, /dev/fd,
 * and std{in,out,err} aliases pointing at /proc/self/fd entries.
 */
static const struct dev_sym_spec device_symlinks[] = {
    {
	"ptmx",
	"pts/ptmx",
    },
    {
	"fd",
	"/proc/self/fd",
    },
    {
	"stdin",
	"fd/0",
    },
    {
	"stdout",
	"fd/1",
    },
    {
	"stderr",
	"fd/2",
    },
};
2013 | | |
2014 | | /* |
2015 | | * Clean up the temporary dev path we had setup previously. In case of errors, |
2016 | | * we don't want to go leaking empty tempdirs. |
2017 | | */ |
2018 | | static void mount_dev_cleanup(char *dev_path) |
2019 | 0 | { |
2020 | 0 | umount2(dev_path, MNT_DETACH); |
2021 | 0 | rmdir(dev_path); |
2022 | 0 | free(dev_path); |
2023 | 0 | } |
2024 | | |
2025 | | /* |
2026 | | * Set up the pseudo /dev path at the temporary location. |
2027 | | * See mount_dev_finalize for more details. |
2028 | | */ |
2029 | | static int mount_dev(char **dev_path_ret) |
2030 | 0 | { |
2031 | 0 | int ret; |
2032 | 0 | attribute_cleanup_fd int dev_fd = -1; |
2033 | 0 | size_t i; |
2034 | 0 | mode_t mask; |
2035 | 0 | char *dev_path; |
2036 | | |
2037 | | /* |
2038 | | * Create a temp path for the /dev init. We'll relocate this to the |
2039 | | * final location later on in the startup process. |
2040 | | */ |
2041 | 0 | dev_path = *dev_path_ret = strdup("/tmp/minijail.dev.XXXXXX"); |
2042 | 0 | if (dev_path == NULL || mkdtemp(dev_path) == NULL) |
2043 | 0 | pdie("could not create temp path for /dev"); |
2044 | | |
2045 | | /* Set up the empty /dev mount point first. */ |
2046 | 0 | ret = mount("minijail-devfs", dev_path, "tmpfs", MS_NOEXEC | MS_NOSUID, |
2047 | 0 | "size=5M,mode=755"); |
2048 | 0 | if (ret) { |
2049 | 0 | rmdir(dev_path); |
2050 | 0 | return ret; |
2051 | 0 | } |
2052 | | |
2053 | | /* We want to set the mode directly from the spec. */ |
2054 | 0 | mask = umask(0); |
2055 | | |
2056 | | /* Get a handle to the temp dev path for *at funcs below. */ |
2057 | 0 | dev_fd = open(dev_path, O_DIRECTORY | O_PATH | O_CLOEXEC); |
2058 | 0 | if (dev_fd < 0) { |
2059 | 0 | ret = 1; |
2060 | 0 | goto done; |
2061 | 0 | } |
2062 | | |
2063 | | /* Create all the nodes in /dev. */ |
2064 | 0 | for (i = 0; i < ARRAY_SIZE(device_nodes); ++i) { |
2065 | 0 | const struct dev_spec *ds = &device_nodes[i]; |
2066 | 0 | ret = mknodat(dev_fd, ds->name, ds->mode, |
2067 | 0 | makedev(ds->major, ds->minor)); |
2068 | 0 | if (ret) |
2069 | 0 | goto done; |
2070 | 0 | } |
2071 | | |
2072 | | /* Create all the symlinks in /dev. */ |
2073 | 0 | for (i = 0; i < ARRAY_SIZE(device_symlinks); ++i) { |
2074 | 0 | const struct dev_sym_spec *ds = &device_symlinks[i]; |
2075 | 0 | ret = symlinkat(ds->dest, dev_fd, ds->source); |
2076 | 0 | if (ret) |
2077 | 0 | goto done; |
2078 | 0 | } |
2079 | | |
2080 | | /* Create empty dir for glibc shared mem APIs. */ |
2081 | 0 | ret = mkdirat(dev_fd, "shm", 01777); |
2082 | 0 | if (ret) |
2083 | 0 | goto done; |
2084 | | |
2085 | | /* Restore old mask. */ |
2086 | 0 | done: |
2087 | 0 | umask(mask); |
2088 | |
|
2089 | 0 | if (ret) |
2090 | 0 | mount_dev_cleanup(dev_path); |
2091 | |
|
2092 | 0 | return ret; |
2093 | 0 | } |
2094 | | |
2095 | | /* |
2096 | | * Relocate the temporary /dev mount to its final /dev place. |
2097 | | * We have to do this two step process so people can bind mount extra |
2098 | | * /dev paths like /dev/log. |
2099 | | */ |
2100 | | static int mount_dev_finalize(const struct minijail *j, char *dev_path) |
2101 | 0 | { |
2102 | 0 | int ret = -1; |
2103 | 0 | char *dest = NULL; |
2104 | | |
2105 | | /* Unmount the /dev mount if possible. */ |
2106 | 0 | if (umount2("/dev", MNT_DETACH)) |
2107 | 0 | goto done; |
2108 | | |
2109 | 0 | if (asprintf(&dest, "%s/dev", j->chrootdir ?: "") < 0) |
2110 | 0 | goto done; |
2111 | | |
2112 | 0 | if (mount(dev_path, dest, NULL, MS_MOVE, NULL)) |
2113 | 0 | goto done; |
2114 | | |
2115 | 0 | ret = 0; |
2116 | 0 | done: |
2117 | 0 | free(dest); |
2118 | 0 | mount_dev_cleanup(dev_path); |
2119 | |
|
2120 | 0 | return ret; |
2121 | 0 | } |
2122 | | |
2123 | | /* |
2124 | | * mount_one: Applies mounts from @m for @j, recursing as needed. |
2125 | | * @j Minijail these mounts are for |
2126 | | * @m Head of list of mounts |
2127 | | * |
2128 | | * Returns 0 for success. |
2129 | | */ |
2130 | | static int mount_one(const struct minijail *j, struct mountpoint *m, |
2131 | | const char *dev_path) |
2132 | 0 | { |
2133 | 0 | int ret; |
2134 | 0 | char *dest; |
2135 | 0 | bool do_remount = false; |
2136 | 0 | bool has_bind_flag = mount_has_bind_flag(m); |
2137 | 0 | bool has_remount_flag = !!(m->flags & MS_REMOUNT); |
2138 | 0 | unsigned long original_mnt_flags = 0; |
2139 | | |
2140 | | /* We assume |dest| has a leading "/". */ |
2141 | 0 | if (dev_path && strncmp("/dev/", m->dest, 5) == 0) { |
2142 | | /* |
2143 | | * Since the temp path is rooted at /dev, skip that dest part. |
2144 | | */ |
2145 | 0 | if (asprintf(&dest, "%s%s", dev_path, m->dest + 4) < 0) |
2146 | 0 | return -ENOMEM; |
2147 | 0 | } else { |
2148 | 0 | if (asprintf(&dest, "%s%s", j->chrootdir ?: "", m->dest) < 0) |
2149 | 0 | return -ENOMEM; |
2150 | 0 | } |
2151 | | |
2152 | 0 | ret = setup_mount_destination(m->src, dest, j->uid, j->gid, |
2153 | 0 | has_bind_flag); |
2154 | 0 | if (ret) { |
2155 | 0 | warn("cannot create mount target '%s'", dest); |
2156 | 0 | goto error; |
2157 | 0 | } |
2158 | | |
2159 | | /* |
2160 | | * Remount bind mounts that: |
2161 | | * - Come from the minijail_bind() API, and |
2162 | | * - Add the 'ro' flag |
2163 | | * since 'bind' and other flags can't both be specified in the same |
2164 | | * mount(2) call. |
2165 | | * Callers using minijail_mount() to perform bind mounts are expected to |
2166 | | * know what they're doing and call minijail_mount() with MS_REMOUNT as |
2167 | | * needed. |
2168 | | * Therefore, if the caller is asking for a remount (using MS_REMOUNT), |
2169 | | * there is no need to do an extra remount here. |
2170 | | */ |
2171 | 0 | if (has_bind_flag && strcmp(m->type, "minijail_bind") == 0 && |
2172 | 0 | !has_remount_flag) { |
2173 | | /* |
2174 | | * Grab the mount flags of the source. These are used to figure |
2175 | | * out whether the bind mount needs to be remounted read-only. |
2176 | | */ |
2177 | 0 | if (get_mount_flags(m->src, &original_mnt_flags)) { |
2178 | 0 | warn("cannot get mount flags for '%s'", m->src); |
2179 | 0 | goto error; |
2180 | 0 | } |
2181 | | |
2182 | 0 | if ((m->flags & MS_RDONLY) != |
2183 | 0 | (original_mnt_flags & MS_RDONLY)) { |
2184 | 0 | do_remount = 1; |
2185 | | /* |
2186 | | * Restrict the mount flags to those that are |
2187 | | * user-settable in a MS_REMOUNT request, but excluding |
2188 | | * MS_RDONLY. The user-requested mount flags will |
2189 | | * dictate whether the remount will have that flag or |
2190 | | * not. |
2191 | | */ |
2192 | 0 | original_mnt_flags &= |
2193 | 0 | (MS_USER_SETTABLE_MASK & ~MS_RDONLY); |
2194 | 0 | } |
2195 | 0 | } |
2196 | | |
2197 | | /* |
2198 | | * Do a final check for symlinks in |m->src|. |
2199 | | * |m->src| will only contain a valid path when purely bind-mounting |
2200 | | * (but not when remounting a bind mount). |
2201 | | * |
2202 | | * Short of having a version of mount(2) that can take fd's, this is the |
2203 | | * smallest we can make the TOCTOU window. |
2204 | | */ |
2205 | 0 | if (has_bind_flag && !has_remount_flag && !is_valid_bind_path(m->src)) { |
2206 | 0 | warn("src '%s' is not a valid bind mount path", m->src); |
2207 | 0 | goto error; |
2208 | 0 | } |
2209 | | |
2210 | 0 | ret = mount(m->src, dest, m->type, m->flags, m->data); |
2211 | 0 | if (ret) { |
2212 | 0 | pwarn("cannot mount '%s' as '%s' with flags %#lx", m->src, dest, |
2213 | 0 | m->flags); |
2214 | 0 | goto error; |
2215 | 0 | } |
2216 | | |
2217 | | /* Remount *after* the initial mount. */ |
2218 | 0 | if (do_remount) { |
2219 | 0 | ret = |
2220 | 0 | mount(m->src, dest, NULL, |
2221 | 0 | m->flags | original_mnt_flags | MS_REMOUNT, m->data); |
2222 | 0 | if (ret) { |
2223 | 0 | pwarn( |
2224 | 0 | "cannot bind-remount '%s' as '%s' with flags %#lx", |
2225 | 0 | m->src, dest, |
2226 | 0 | m->flags | original_mnt_flags | MS_REMOUNT); |
2227 | 0 | goto error; |
2228 | 0 | } |
2229 | 0 | } |
2230 | | |
2231 | 0 | free(dest); |
2232 | 0 | if (m->next) |
2233 | 0 | return mount_one(j, m->next, dev_path); |
2234 | 0 | return 0; |
2235 | | |
2236 | 0 | error: |
2237 | 0 | free(dest); |
2238 | 0 | return ret; |
2239 | 0 | } |
2240 | | |
2241 | | static void process_mounts_or_die(const struct minijail *j) |
2242 | 0 | { |
2243 | | /* |
2244 | | * We have to mount /dev first in case there are bind mounts from |
2245 | | * the original /dev into the new unique tmpfs one. |
2246 | | */ |
2247 | 0 | char *dev_path = NULL; |
2248 | 0 | if (j->flags.mount_dev && mount_dev(&dev_path)) |
2249 | 0 | pdie("mount_dev failed"); |
2250 | |
|
2251 | 0 | if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) { |
2252 | 0 | warn("mount_one failed with /dev at '%s'", dev_path); |
2253 | |
|
2254 | 0 | if (dev_path) |
2255 | 0 | mount_dev_cleanup(dev_path); |
2256 | |
|
2257 | 0 | _exit(MINIJAIL_ERR_MOUNT); |
2258 | 0 | } |
2259 | | |
2260 | | /* |
2261 | | * Once all bind mounts have been processed, move the temp dev to |
2262 | | * its final /dev home. |
2263 | | */ |
2264 | 0 | if (j->flags.mount_dev && mount_dev_finalize(j, dev_path)) |
2265 | 0 | pdie("mount_dev_finalize failed"); |
2266 | 0 | } |
2267 | | |
2268 | | static int enter_chroot(const struct minijail *j) |
2269 | 0 | { |
2270 | 0 | run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT); |
2271 | |
|
2272 | 0 | if (chroot(j->chrootdir)) |
2273 | 0 | return -errno; |
2274 | | |
2275 | 0 | if (chdir("/")) |
2276 | 0 | return -errno; |
2277 | | |
2278 | 0 | return 0; |
2279 | 0 | } |
2280 | | |
/*
 * Makes |j->chrootdir| the new root filesystem using pivot_root(2), then
 * detaches and lazily unmounts the old root so the jailed process cannot
 * reach it. Returns 0 on success, -errno for some failures; most failures
 * are fatal via pdie(). The statement order below is load-bearing.
 */
static int enter_pivot_root(const struct minijail *j)
{
	attribute_cleanup_fd int oldroot = -1;
	attribute_cleanup_fd int newroot = -1;

	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);

	/*
	 * Keep the fd for both old and new root.
	 * It will be used in fchdir(2) later.
	 */
	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
	if (oldroot < 0)
		pdie("failed to open / for fchdir");
	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
	if (newroot < 0)
		pdie("failed to open %s for fchdir", j->chrootdir);

	/*
	 * To ensure j->chrootdir is the root of a filesystem,
	 * do a self bind mount.
	 */
	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
		pdie("failed to bind mount '%s'", j->chrootdir);
	if (chdir(j->chrootdir))
		return -errno;
	/* pivot_root(".", ".") stacks the old root on top of the new one. */
	if (syscall(SYS_pivot_root, ".", "."))
		pdie("pivot_root");

	/*
	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
	 * change to the old root and unmount it.
	 */
	if (fchdir(oldroot))
		pdie("failed to fchdir to old /");

	/*
	 * If skip_remount_private was enabled for minijail_enter(),
	 * there could be a shared mount point under |oldroot|. In that case,
	 * mounts under this shared mount point will be unmounted below, and
	 * this unmounting will propagate to the original mount namespace
	 * (because the mount point is shared). To prevent this unexpected
	 * unmounting, remove these mounts from their peer groups by recursively
	 * remounting them as MS_PRIVATE.
	 */
	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
		pdie("failed to mount(/, private) before umount(/)");
	/* The old root might be busy, so use lazy unmount. */
	if (umount2(".", MNT_DETACH))
		pdie("umount(/)");
	/* Change back to the new root. */
	if (fchdir(newroot))
		return -errno;
	if (chroot("/"))
		return -errno;
	/* Set correct CWD for getcwd(3). */
	if (chdir("/"))
		return -errno;

	return 0;
}
2342 | | |
2343 | | static int mount_tmp(const struct minijail *j) |
2344 | 0 | { |
2345 | 0 | const char fmt[] = "size=%zu,mode=1777"; |
2346 | | /* Count for the user storing ULLONG_MAX literally + extra space. */ |
2347 | 0 | char data[sizeof(fmt) + sizeof("18446744073709551615ULL")]; |
2348 | 0 | int ret; |
2349 | |
|
2350 | 0 | ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size); |
2351 | |
|
2352 | 0 | if (ret <= 0) |
2353 | 0 | pdie("tmpfs size spec error"); |
2354 | 0 | else if ((size_t)ret >= sizeof(data)) |
2355 | 0 | pdie("tmpfs size spec too large"); |
2356 | |
|
2357 | 0 | unsigned long flags = MS_NODEV | MS_NOEXEC | MS_NOSUID; |
2358 | |
|
2359 | 0 | if (block_symlinks_in_noninit_mountns_tmp()) { |
2360 | 0 | flags |= MS_NOSYMFOLLOW; |
2361 | 0 | } |
2362 | |
|
2363 | 0 | return mount("none", "/tmp", "tmpfs", flags, data); |
2364 | 0 | } |
2365 | | |
/*
 * Replaces the inherited /proc with a fresh read-only proc mount.
 * Returns 0 on success, -errno on failure.
 */
static int remount_proc_readonly(const struct minijail *j)
{
	const char *kProcPath = "/proc";
	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
	/*
	 * Right now, we're holding a reference to our parent's old mount of
	 * /proc in our namespace, which means using MS_REMOUNT here would
	 * mutate our parent's mount as well, even though we're in a VFS
	 * namespace (!). Instead, remove their mount from our namespace lazily
	 * (MNT_DETACH) and make our own.
	 *
	 * However, we skip this in the user namespace case because it will
	 * invariably fail. Every mount namespace is "owned" by the
	 * user namespace of the process that creates it. Mount namespace A is
	 * "less privileged" than mount namespace B if A is created off of B,
	 * and B is owned by a different user namespace.
	 * When a less privileged mount namespace is created, the mounts used to
	 * initialize it (coming from the more privileged mount namespace) come
	 * as a unit, and are locked together. This means that code running in
	 * the new mount (and user) namespace cannot piecemeal unmount
	 * individual mounts inherited from a more privileged mount namespace.
	 * See https://man7.org/linux/man-pages/man7/mount_namespaces.7.html,
	 * "Restrictions on mount namespaces" for details.
	 *
	 * This happens in our use case because we first enter a new user
	 * namespace (on clone(2)) and then we unshare(2) a new mount namespace,
	 * which means the new mount namespace is less privileged than its
	 * parent mount namespace. This would also happen if we entered a new
	 * mount namespace on clone(2), since the user namespace is created
	 * first.
	 * In all other non-user-namespace cases the new mount namespace is
	 * similarly privileged as the parent mount namespace so unmounting a
	 * single mount is allowed.
	 *
	 * We still remount /proc as read-only in the user namespace case
	 * because while a process with CAP_SYS_ADMIN in the new user namespace
	 * can unmount the RO mount and get at the RW mount, an attacker with
	 * access only to a write primitive will not be able to modify /proc.
	 */
	if (!j->flags.userns && umount2(kProcPath, MNT_DETACH))
		return -errno;
	if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
		return -errno;
	return 0;
}
2411 | | |
/*
 * SIGKILLs the jailed init process before aborting with |msg|, so a fatal
 * error in the parent does not leave an orphaned child running.
 */
static void kill_child_and_die(const struct minijail *j, const char *msg)
{
	kill(j->initpid, SIGKILL);
	die("%s", msg);
}
2417 | | |
/* Writes |j->initpid| to the configured pid file path; fatal on failure. */
static void write_pid_file_or_die(const struct minijail *j)
{
	if (write_pid_to_path(j->initpid, j->pid_file_path))
		kill_child_and_die(j, "failed to write pid file");
}
2423 | | |
2424 | | static void add_to_cgroups_or_die(const struct minijail *j) |
2425 | 0 | { |
2426 | 0 | size_t i; |
2427 | |
|
2428 | 0 | for (i = 0; i < j->cgroup_count; ++i) { |
2429 | 0 | if (write_pid_to_path(j->initpid, j->cgroups[i])) |
2430 | 0 | kill_child_and_die(j, "failed to add to cgroups"); |
2431 | 0 | } |
2432 | 0 | } |
2433 | | |
2434 | | static void set_rlimits_or_die(const struct minijail *j) |
2435 | 0 | { |
2436 | 0 | size_t i; |
2437 | |
|
2438 | 0 | for (i = 0; i < j->rlimit_count; ++i) { |
2439 | 0 | struct rlimit limit; |
2440 | 0 | limit.rlim_cur = j->rlimits[i].cur; |
2441 | 0 | limit.rlim_max = j->rlimits[i].max; |
2442 | 0 | if (prlimit(j->initpid, j->rlimits[i].type, &limit, NULL)) |
2443 | 0 | kill_child_and_die(j, "failed to set rlimit"); |
2444 | 0 | } |
2445 | 0 | } |
2446 | | |
2447 | | static void write_ugid_maps_or_die(const struct minijail *j) |
2448 | 0 | { |
2449 | 0 | if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0) |
2450 | 0 | kill_child_and_die(j, "failed to write uid_map"); |
2451 | 0 | if (j->gidmap && j->flags.disable_setgroups) { |
2452 | | /* |
2453 | | * Older kernels might not have the /proc/<pid>/setgroups files. |
2454 | | */ |
2455 | 0 | int ret = write_proc_file(j->initpid, "deny", "setgroups"); |
2456 | 0 | if (ret != 0) { |
2457 | 0 | if (ret == -ENOENT) { |
2458 | | /* |
2459 | | * See |
2460 | | * http://man7.org/linux/man-pages/man7/user_namespaces.7.html. |
2461 | | */ |
2462 | 0 | warn("could not disable setgroups(2)"); |
2463 | 0 | } else |
2464 | 0 | kill_child_and_die( |
2465 | 0 | j, "failed to disable setgroups(2)"); |
2466 | 0 | } |
2467 | 0 | } |
2468 | 0 | if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0) |
2469 | 0 | kill_child_and_die(j, "failed to write gid_map"); |
2470 | 0 | } |
2471 | | |
2472 | | static void enter_user_namespace(const struct minijail *j) |
2473 | 0 | { |
2474 | 0 | int uid = j->flags.uid ? j->uid : 0; |
2475 | 0 | int gid = j->flags.gid ? j->gid : 0; |
2476 | 0 | if (j->gidmap && setresgid(gid, gid, gid)) { |
2477 | 0 | pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid, |
2478 | 0 | gid); |
2479 | 0 | } |
2480 | 0 | if (j->uidmap && setresuid(uid, uid, uid)) { |
2481 | 0 | pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid, |
2482 | 0 | uid); |
2483 | 0 | } |
2484 | 0 | } |
2485 | | |
/*
 * Signals the child that parent-side setup is done by closing both ends of
 * the sync pipe; the child's blocking read(2) then returns 0 (EOF).
 */
static void parent_setup_complete(int *pipe_fds)
{
	close_and_reset(&pipe_fds[0]);
	close_and_reset(&pipe_fds[1]);
}
2491 | | |
2492 | | /* |
2493 | | * wait_for_parent_setup: Called by the child process to wait for any |
2494 | | * further parent-side setup to complete before continuing. |
2495 | | */ |
2496 | | static void wait_for_parent_setup(int *pipe_fds) |
2497 | 0 | { |
2498 | 0 | char buf; |
2499 | |
|
2500 | 0 | close_and_reset(&pipe_fds[1]); |
2501 | | |
2502 | | /* Wait for parent to complete setup and close the pipe. */ |
2503 | 0 | if (read(pipe_fds[0], &buf, 1) != 0) |
2504 | 0 | die("failed to sync with parent"); |
2505 | 0 | close_and_reset(&pipe_fds[0]); |
2506 | 0 | } |
2507 | | |
/*
 * Drops to the configured uid/gid and applies the requested supplementary
 * group policy (inherit via initgroups, set an explicit list, keep, or
 * clear). Group changes happen before the uid change. Fatal on any failure.
 */
static void drop_ugid(const struct minijail *j)
{
	/* The three supplementary-group modes are mutually exclusive. */
	if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids +
		j->flags.set_suppl_gids >
	    1) {
		die("can only do one of inherit, keep, or set supplementary "
		    "groups");
	}

	if (j->flags.inherit_suppl_gids) {
		if (initgroups(j->user, j->usergid))
			pdie("initgroups(%s, %d) failed", j->user, j->usergid);
	} else if (j->flags.set_suppl_gids) {
		if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
			pdie("setgroups(suppl_gids) failed");
	} else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) {
		/*
		 * Only attempt to clear supplementary groups if we are changing
		 * users or groups, and if the caller did not request to disable
		 * setgroups (used when entering a user namespace as a
		 * non-privileged user).
		 */
		if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
			pdie("setgroups(0, NULL) failed");
	}

	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
		pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid);

	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
		pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid);
}
2540 | | |
/*
 * Drops every capability not set in |keep_mask| from the bounding set,
 * up to |last_valid_cap|; fatal on failure.
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	const uint64_t one = 1;
	for (unsigned int cap = 0;
	     cap < sizeof(keep_mask) * 8 && cap <= last_valid_cap; ++cap) {
		if (!(keep_mask & (one << cap)) &&
		    prctl(PR_CAPBSET_DROP, cap))
			pdie("could not drop capability from bounding set");
	}
}
2552 | | |
/*
 * Reduces the process capability sets to exactly |j->caps| (plus a temporary
 * CAP_SETPCAP used to shrink the bounding set), then optionally raises the
 * matching ambient capabilities. No-op unless caps were requested. Fatal on
 * any libcap/prctl failure. The raise-then-drop ordering is intentional; see
 * the comments below.
 */
static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
{
	if (!j->flags.use_caps)
		return;

	cap_t caps = cap_get_proc();
	cap_value_t flag[1];
	const size_t ncaps = sizeof(j->caps) * 8;
	const uint64_t one = 1;
	unsigned int i;
	if (!caps)
		die("can't get process caps");
	if (cap_clear(caps))
		die("can't clear caps");

	/* Raise each requested cap in effective/permitted/inheritable. */
	for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
		/* Keep CAP_SETPCAP for dropping bounding set bits. */
		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
			continue;
		flag[0] = i;
		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
			die("can't add effective cap");
		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
			die("can't add permitted cap");
		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
			die("can't add inheritable cap");
	}
	if (cap_set_proc(caps))
		die("can't apply initial cleaned capset");

	/*
	 * Instead of dropping the bounding set first, do it here in case
	 * the caller had a more permissive bounding set which could
	 * have been used above to raise a capability that wasn't already
	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
	 *
	 * However, if we're asked to skip setting *and* locking the
	 * SECURE_NOROOT securebit, also skip dropping the bounding set.
	 * If the caller wants to regain all capabilities when executing a
	 * set-user-ID-root program, allow them to do so. The default behavior
	 * (i.e. the behavior without |securebits_skip_mask| set) will still put
	 * the jailed process tree in a capabilities-only environment.
	 *
	 * We check the negated skip mask for SECURE_NOROOT and
	 * SECURE_NOROOT_LOCKED. If the bits are set in the negated mask they
	 * will *not* be skipped in lock_securebits(), and therefore we should
	 * drop the bounding set.
	 */
	if (secure_noroot_set_and_locked(~j->securebits_skip_mask)) {
		drop_capbset(j->caps, last_valid_cap);
	} else {
		warn("SECURE_NOROOT not set, not dropping bounding set");
	}

	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
		flag[0] = CAP_SETPCAP;
		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
			die("can't clear effective cap");
		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
			die("can't clear permitted cap");
		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
			die("can't clear inheritable cap");
	}

	if (cap_set_proc(caps))
		die("can't apply final cleaned capset");

	/*
	 * If ambient capabilities are supported, clear all capabilities first,
	 * then raise the requested ones.
	 */
	if (j->flags.set_ambient_caps) {
		if (!cap_ambient_supported()) {
			pdie("ambient capabilities not supported");
		}
		if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) !=
		    0) {
			pdie("can't clear ambient capabilities");
		}

		for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
			if (!(j->caps & (one << i)))
				continue;

			if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0,
				  0) != 0) {
				pdie("prctl(PR_CAP_AMBIENT, "
				     "PR_CAP_AMBIENT_RAISE, %u) failed",
				     i);
			}
		}
	}

	cap_free(caps);
}
2649 | | |
/*
 * Calls landlock_restrict_self(), based on current inodes.
 * Populates the previously-created ruleset fd (j->fs_rules_fd) with every
 * fs rule, then enforces it on the current process and closes the fd.
 * No-op when fs restrictions are disabled or no rules exist.
 */
static void apply_landlock_restrictions(const struct minijail *j)
{
	struct fs_rule *r = j->fs_rules_head;
	/* The ruleset_fd needs to be mutable so use a stack copy from now on.
	 */
	int ruleset_fd = j->fs_rules_fd;
	if (!j->flags.enable_fs_restrictions || !r) {
		return;
	}

	/* Add each configured path rule to the ruleset. */
	if (minijail_is_fs_restriction_available()) {
		while (r) {
			populate_ruleset_internal(r->path, ruleset_fd,
						  r->landlock_flags);
			r = r->next;
		}
	}

	/* Enforce the ruleset; failure to enforce is fatal. */
	if (ruleset_fd >= 0) {
		if (j->filename != NULL) {
			info("applying Landlock to process %s", j->filename);
		}
		if (landlock_restrict_self(ruleset_fd, 0)) {
			pdie("failed to enforce ruleset");
		}
		close(ruleset_fd);
	}
}
2679 | | |
2680 | | static void set_no_new_privs(const struct minijail *j) |
2681 | 0 | { |
2682 | 0 | if (j->flags.no_new_privs) { |
2683 | 0 | if (!sys_set_no_new_privs()) { |
2684 | 0 | die("set_no_new_privs() failed"); |
2685 | 0 | } |
2686 | 0 | } |
2687 | 0 | } |
2688 | | |
/*
 * Installs the jail's seccomp-bpf filter program (j->filter_prog) on the
 * calling process.  Handles three preliminaries before installation:
 * skipping entirely under (HW)ASan builds, installing a SIGSYS handler when
 * failure logging is requested but SECCOMP_RET_LOG is unavailable, and
 * resetting the SIGSYS disposition in TSYNC mode.
 */
static void set_seccomp_filter(const struct minijail *j)
{
	/*
	 * Code running with ASan
	 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
	 * will make system calls not included in the syscall filter policy,
	 * which will likely crash the program. Skip setting seccomp filter in
	 * that case.
	 * 'running_with_asan()' has no inputs and is completely defined at
	 * build time, so this cannot be used by an attacker to skip setting
	 * seccomp filter.
	 */
	if (j->flags.seccomp_filter && running_with_asan()) {
		warn("running with (HW)ASan, not setting seccomp filter");
		return;
	}

	/* Set up SIGSYS handling before the filter is live. */
	if (j->flags.seccomp_filter) {
		if (seccomp_is_logging_allowed(j)) {
			warn("logging seccomp filter failures");
			if (!seccomp_ret_log_available()) {
				/*
				 * If SECCOMP_RET_LOG is not available,
				 * install the SIGSYS handler first.
				 */
				if (install_sigsys_handler())
					pdie(
					    "failed to install SIGSYS handler");
			}
		} else if (j->flags.seccomp_filter_tsync) {
			/*
			 * If setting thread sync,
			 * reset the SIGSYS signal handler so that
			 * the entire thread group is killed.
			 */
			if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
				pdie("failed to reset SIGSYS disposition");
		}
	}

	/*
	 * Install the syscall filter.
	 */
	if (j->flags.seccomp_filter) {
		if (j->flags.seccomp_filter_tsync ||
		    j->flags.seccomp_filter_allow_speculation) {
			/*
			 * TSYNC (apply to all threads) and SPEC_ALLOW (skip
			 * speculation mitigations) require the seccomp(2)
			 * syscall rather than prctl(2).
			 */
			int filter_flags =
			    (j->flags.seccomp_filter_tsync
				 ? SECCOMP_FILTER_FLAG_TSYNC
				 : 0) |
			    (j->flags.seccomp_filter_allow_speculation
				 ? SECCOMP_FILTER_FLAG_SPEC_ALLOW
				 : 0);
			if (sys_seccomp(SECCOMP_SET_MODE_FILTER, filter_flags,
					j->filter_prog)) {
				pdie("seccomp(tsync) failed");
			}
		} else {
			if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
				  j->filter_prog)) {
				pdie("prctl(seccomp_filter) failed");
			}
		}
	}
}
2754 | | |
2755 | | static pid_t forward_pid = -1; |
2756 | | |
2757 | | static void forward_signal(int sig, siginfo_t *siginfo attribute_unused, |
2758 | | void *void_context attribute_unused) |
2759 | 0 | { |
2760 | 0 | if (forward_pid != -1) { |
2761 | 0 | kill(forward_pid, sig); |
2762 | 0 | } |
2763 | 0 | } |
2764 | | |
2765 | | static void install_signal_handlers(void) |
2766 | 0 | { |
2767 | 0 | struct sigaction act; |
2768 | |
|
2769 | 0 | memset(&act, 0, sizeof(act)); |
2770 | 0 | act.sa_sigaction = &forward_signal; |
2771 | 0 | act.sa_flags = SA_SIGINFO | SA_RESTART; |
2772 | | |
2773 | | /* Handle all signals, except SIGCHLD. */ |
2774 | 0 | for (int sig = 1; sig < NSIG; sig++) { |
2775 | | /* |
2776 | | * We don't care if we get EINVAL: that just means that we |
2777 | | * can't handle this signal, so let's skip it and continue. |
2778 | | */ |
2779 | 0 | sigaction(sig, &act, NULL); |
2780 | 0 | } |
2781 | | /* Reset SIGCHLD's handler. */ |
2782 | 0 | signal(SIGCHLD, SIG_DFL); |
2783 | | |
2784 | | /* Handle real-time signals. */ |
2785 | 0 | for (int sig = SIGRTMIN; sig <= SIGRTMAX; sig++) { |
2786 | 0 | sigaction(sig, &act, NULL); |
2787 | 0 | } |
2788 | 0 | } |
2789 | | |
2790 | | static const char *lookup_hook_name(minijail_hook_event_t event) |
2791 | 0 | { |
2792 | 0 | switch (event) { |
2793 | 0 | case MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS: |
2794 | 0 | return "pre-drop-caps"; |
2795 | 0 | case MINIJAIL_HOOK_EVENT_PRE_EXECVE: |
2796 | 0 | return "pre-execve"; |
2797 | 0 | case MINIJAIL_HOOK_EVENT_PRE_CHROOT: |
2798 | 0 | return "pre-chroot"; |
2799 | 0 | case MINIJAIL_HOOK_EVENT_MAX: |
2800 | | /* |
2801 | | * Adding this in favor of a default case to force the |
2802 | | * compiler to error out if a new enum value is added. |
2803 | | */ |
2804 | 0 | break; |
2805 | 0 | } |
2806 | 0 | return "unknown"; |
2807 | 0 | } |
2808 | | |
2809 | | static void run_hooks_or_die(const struct minijail *j, |
2810 | | minijail_hook_event_t event) |
2811 | 0 | { |
2812 | 0 | int rc; |
2813 | 0 | int hook_index = 0; |
2814 | 0 | for (struct hook *c = j->hooks_head; c; c = c->next) { |
2815 | 0 | if (c->event != event) |
2816 | 0 | continue; |
2817 | 0 | rc = c->hook(c->payload); |
2818 | 0 | if (rc != 0) { |
2819 | 0 | errno = -rc; |
2820 | 0 | pdie("%s hook (index %d) failed", |
2821 | 0 | lookup_hook_name(event), hook_index); |
2822 | 0 | } |
2823 | | /* Only increase the index within the same hook event type. */ |
2824 | 0 | ++hook_index; |
2825 | 0 | } |
2826 | 0 | } |
2827 | | |
/*
 * Applies the jail's confinement to the calling process, in a strict order:
 * namespaces first, then mounts/chroot/pivot_root, then the pre-drop-caps
 * hooks, then capability/uid/gid drops, and seccomp last.  Cannot be used
 * for pid-namespaced jails (use minijail_run() for those).  Any failure
 * aborts the whole process, since privileges may already be partially
 * dropped and there is no safe way to recover.
 */
void API minijail_enter(const struct minijail *j)
{
	/*
	 * If we're dropping caps, get the last valid cap from /proc now,
	 * since /proc can be unmounted before drop_caps() is called.
	 */
	unsigned int last_valid_cap = 0;
	if (j->flags.capbset_drop || j->flags.use_caps)
		last_valid_cap = get_last_valid_cap();

	if (j->flags.pids)
		die("tried to enter a pid-namespaced jail;"
		    " try minijail_run()?");

	if (j->flags.inherit_suppl_gids && !j->user)
		die("cannot inherit supplementary groups without setting a "
		    "username");

	/*
	 * We can't recover from failures if we've dropped privileges partially,
	 * so we don't even try. If any of our operations fail, we abort() the
	 * entire process.
	 */
	if (j->flags.enter_vfs) {
		if (setns(j->mountns_fd, CLONE_NEWNS))
			pdie("setns(CLONE_NEWNS) failed");
		/* The namespace fd is no longer needed once we've joined. */
		close(j->mountns_fd);
	}

	if (j->flags.vfs) {
		if (unshare(CLONE_NEWNS))
			pdie("unshare(CLONE_NEWNS) failed");
		/*
		 * By default, remount all filesystems as private, unless
		 * - Passed a specific remount mode, in which case remount with
		 *   that,
		 * - Asked not to remount at all, in which case skip the
		 *   mount(2) call.
		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
		 */
		if (j->remount_mode) {
			if (mount(NULL, "/", NULL, MS_REC | j->remount_mode,
				  NULL))
				pdie("mount(NULL, /, NULL, "
				     "MS_REC | j->remount_mode, NULL) failed");

			/*
			 * Per-path remount overrides; each must be at least
			 * as permissive as the root remount mode.
			 */
			struct minijail_remount *temp = j->remounts_head;
			while (temp) {
				if (temp->remount_mode < j->remount_mode)
					die("cannot remount %s as stricter "
					    "than the root dir",
					    temp->mount_name);
				if (mount(NULL, temp->mount_name, NULL,
					  MS_REC | temp->remount_mode, NULL))
					pdie("mount(NULL, %s, NULL, "
					     "MS_REC | temp->remount_mode, "
					     "NULL) failed",
					     temp->mount_name);
				temp = temp->next;
			}
		}
	}

	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
		pdie("unshare(CLONE_NEWIPC) failed");
	}

	if (j->flags.uts) {
		if (unshare(CLONE_NEWUTS))
			pdie("unshare(CLONE_NEWUTS) failed");

		/* A hostname only makes sense inside a fresh UTS namespace. */
		if (j->hostname &&
		    sethostname(j->hostname, strlen(j->hostname)))
			pdie("sethostname(%s) failed", j->hostname);
	}

	if (j->flags.enter_net) {
		if (setns(j->netns_fd, CLONE_NEWNET))
			pdie("setns(CLONE_NEWNET) failed");
		close(j->netns_fd);
	} else if (j->flags.net) {
		if (unshare(CLONE_NEWNET))
			pdie("unshare(CLONE_NEWNET) failed");
		if (j->flags.net_loopback)
			config_net_loopback();
	}

	if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
		pdie("unshare(CLONE_NEWCGROUP) failed");

	/* Detach from the caller's session keyring if requested. */
	if (j->flags.new_session_keyring) {
		if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0)
			pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed");
	}

	/* We have to process all the mounts before we chroot/pivot_root. */
	process_mounts_or_die(j);

	if (j->flags.chroot && enter_chroot(j))
		pdie("chroot");

	if (j->flags.pivot_root && enter_pivot_root(j))
		pdie("pivot_root");

	if (j->flags.mount_tmp && mount_tmp(j))
		pdie("mount_tmp");

	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
		pdie("remount");

	/* User hooks run while we still hold full capabilities. */
	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS);

	/*
	 * If we're only dropping capabilities from the bounding set, but not
	 * from the thread's (permitted|inheritable|effective) sets, do it now.
	 */
	if (j->flags.capbset_drop) {
		drop_capbset(j->cap_bset, last_valid_cap);
	}

	/*
	 * POSIX capabilities are a bit tricky. We must set SECBIT_KEEP_CAPS
	 * before drop_ugid() below as the latter would otherwise drop all
	 * capabilities.
	 */
	if (j->flags.use_caps) {
		/*
		 * When using ambient capabilities, CAP_SET{GID,UID} can be
		 * inherited across execve(2), so SECBIT_KEEP_CAPS is not
		 * strictly needed.
		 */
		bool require_keep_caps = !j->flags.set_ambient_caps;
		if (lock_securebits(j->securebits_skip_mask,
				    require_keep_caps) < 0) {
			pdie("locking securebits failed");
		}
	}

	if (j->flags.no_new_privs) {
		/*
		 * If we're setting no_new_privs, we can drop privileges
		 * before setting seccomp filter. This way filter policies
		 * don't need to allow privilege-dropping syscalls.
		 */
		drop_ugid(j);
		drop_caps(j, last_valid_cap);

		/*
		 * Landlock is applied as late as possible. If no_new_privs is
		 * requested, then we need to set that first because the
		 * landlock_restrict_self() syscall has a seccomp(2) like check
		 * for that. See:
		 * https://elixir.bootlin.com/linux/v5.15.74/source/security/landlock/syscalls.c#L409
		 */
		set_no_new_privs(j);
		apply_landlock_restrictions(j);
		set_seccomp_filter(j);
	} else {
		apply_landlock_restrictions(j);

		/*
		 * If we're not setting no_new_privs,
		 * we need to set seccomp filter *before* dropping privileges.
		 * WARNING: this means that filter policies *must* allow
		 * setgroups()/setresgid()/setresuid() for dropping root and
		 * capget()/capset()/prctl() for dropping caps.
		 */
		set_seccomp_filter(j);
		drop_ugid(j);
		drop_caps(j, last_valid_cap);
	}

	/*
	 * Select the specified alternate syscall table. The table must not
	 * block prctl(2) if we're using seccomp as well.
	 */
	if (j->flags.alt_syscall) {
		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
			pdie("prctl(PR_ALT_SYSCALL) failed");
	}

	/*
	 * seccomp has to come last since it cuts off all the other
	 * privilege-dropping syscalls :)
	 * Note: value 1 here is legacy strict-mode seccomp, not filter mode.
	 */
	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
		if ((errno == EINVAL) && seccomp_can_softfail()) {
			warn("seccomp not supported");
			return;
		}
		pdie("prctl(PR_SET_SECCOMP) failed");
	}
}
3021 | | |
/* Exit status to propagate from the init stub; set by init() below. */
/* TODO(wad): will visibility affect this variable? */
static int init_exitstatus = 0;

/* SIGTERM handler for the init stub: exit with the recorded status. */
static void init_term(int sig attribute_unused)
{
	_exit(init_exitstatus);
}
3029 | | |
3030 | | static void init(pid_t rootpid) |
3031 | 0 | { |
3032 | 0 | pid_t pid; |
3033 | 0 | int status; |
3034 | | /* So that we exit with the right status. */ |
3035 | 0 | signal(SIGTERM, init_term); |
3036 | | /* TODO(wad): self jail with seccomp filters here. */ |
3037 | 0 | while ((pid = wait(&status)) > 0) { |
3038 | | /* |
3039 | | * This loop will only end when either there are no processes |
3040 | | * left inside our pid namespace or we get a signal. |
3041 | | */ |
3042 | 0 | if (pid == rootpid) |
3043 | 0 | init_exitstatus = status; |
3044 | 0 | } |
3045 | 0 | if (!WIFEXITED(init_exitstatus)) |
3046 | 0 | _exit(MINIJAIL_ERR_INIT); |
3047 | 0 | _exit(WEXITSTATUS(init_exitstatus)); |
3048 | 0 | } |
3049 | | |
3050 | | int API minijail_from_fd(int fd, struct minijail *j) |
3051 | 0 | { |
3052 | 0 | size_t sz = 0; |
3053 | 0 | int err = read_exactly(fd, &sz, sizeof(sz)); |
3054 | 0 | attribute_cleanup_str char *buf = NULL; |
3055 | 0 | if (err) { |
3056 | 0 | pwarn("failed to read marshalled minijail size"); |
3057 | 0 | return err; |
3058 | 0 | } |
3059 | 0 | if (sz > USHRT_MAX) /* arbitrary check */ |
3060 | 0 | return -E2BIG; |
3061 | 0 | buf = malloc(sz); |
3062 | 0 | if (!buf) |
3063 | 0 | return -ENOMEM; |
3064 | 0 | err = read_exactly(fd, buf, sz); |
3065 | 0 | if (err) { |
3066 | 0 | pwarn("failed to read marshalled minijail payload"); |
3067 | 0 | return err; |
3068 | 0 | } |
3069 | 0 | return minijail_unmarshal(j, buf, sz); |
3070 | 0 | } |
3071 | | |
3072 | | int API minijail_to_fd(struct minijail *j, int fd) |
3073 | 0 | { |
3074 | 0 | size_t sz = minijail_size(j); |
3075 | 0 | if (!sz) |
3076 | 0 | return -EINVAL; |
3077 | | |
3078 | 0 | attribute_cleanup_str char *buf = malloc(sz); |
3079 | 0 | if (!buf) |
3080 | 0 | return -ENOMEM; |
3081 | | |
3082 | 0 | int err = minijail_marshal(j, buf, sz); |
3083 | 0 | if (err) |
3084 | 0 | return err; |
3085 | | |
3086 | | /* Sends [size][minijail]. */ |
3087 | 0 | err = write_exactly(fd, &sz, sizeof(sz)); |
3088 | 0 | if (err) |
3089 | 0 | return err; |
3090 | | |
3091 | 0 | return write_exactly(fd, buf, sz); |
3092 | 0 | } |
3093 | | |
3094 | | int API minijail_copy_jail(const struct minijail *from, struct minijail *out) |
3095 | 0 | { |
3096 | 0 | size_t sz = minijail_size(from); |
3097 | 0 | if (!sz) |
3098 | 0 | return -EINVAL; |
3099 | | |
3100 | 0 | attribute_cleanup_str char *buf = malloc(sz); |
3101 | 0 | if (!buf) |
3102 | 0 | return -ENOMEM; |
3103 | | |
3104 | 0 | int err = minijail_marshal(from, buf, sz); |
3105 | 0 | if (err) |
3106 | 0 | return err; |
3107 | | |
3108 | 0 | return minijail_unmarshal(out, buf, sz); |
3109 | 0 | } |
3110 | | |
/*
 * Appends the minijail preload library to the child's LD_PRELOAD variable
 * (preserving any existing value).  On Android this is a no-op.
 * Returns 0 on success, -1 on allocation failure, or the minijail_setenv()
 * result.
 */
static int setup_preload(const struct minijail *j attribute_unused,
			 char ***child_env attribute_unused)
{
#if defined(__ANDROID__)
	/* Don't use LDPRELOAD on Android. */
	return 0;
#else
	/* Fall back to the build-time default path if none was configured. */
	const char *preload_path = j->preload_path ?: PRELOADPATH;
	char *newenv = NULL;
	int ret = 0;
	const char *oldenv = minijail_getenv(*child_env, kLdPreloadEnvVar);

	if (!oldenv)
		oldenv = "";

	/* Only insert a separating space if we have something to separate... */
	if (asprintf(&newenv, "%s%s%s", oldenv, oldenv[0] != '\0' ? " " : "",
		     preload_path) < 0) {
		return -1;
	}

	ret = minijail_setenv(child_env, kLdPreloadEnvVar, newenv, 1);
	free(newenv);
	return ret;
#endif
}
3137 | | |
3138 | | /* |
3139 | | * This is for logging purposes and does not change the enforced seccomp |
3140 | | * filter. |
3141 | | */ |
3142 | | static int setup_seccomp_policy_path(const struct minijail *j, |
3143 | | char ***child_env) |
3144 | 0 | { |
3145 | 0 | return minijail_setenv(child_env, kSeccompPolicyPathEnvVar, |
3146 | 0 | j->seccomp_policy_path ? j->seccomp_policy_path |
3147 | 0 | : "NO-LABEL", |
3148 | 0 | 1 /* overwrite */); |
3149 | 0 | } |
3150 | | |
3151 | | static int setup_pipe(char ***child_env, int fds[2]) |
3152 | 0 | { |
3153 | 0 | int r = pipe(fds); |
3154 | 0 | char fd_buf[11]; |
3155 | 0 | if (r) |
3156 | 0 | return r; |
3157 | 0 | r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); |
3158 | 0 | if (r <= 0) |
3159 | 0 | return -EINVAL; |
3160 | 0 | return minijail_setenv(child_env, kFdEnvVar, fd_buf, 1); |
3161 | 0 | } |
3162 | | |
/*
 * Closes every open fd of this process except those listed in
 * |inheritable_fds| (and the fd used to scan /proc/self/fd itself).
 * Returns 0 on success, -1 if /proc/self/fd could not be opened.
 */
static int close_open_fds(int *inheritable_fds, size_t size)
{
	DIR *d = opendir("/proc/self/fd");
	if (d == NULL)
		return -1;

	const int dir_fd = dirfd(d);
	struct dirent *entry;
	while ((entry = readdir(d)) != NULL) {
		char *end;
		const int fd = strtol(entry->d_name, &end, 10);

		/* Skip entries that are not pure fd numbers ("." and ".."). */
		if (*end != '\0')
			continue;

		/* Never close the fd backing this directory scan. */
		if (fd == dir_fd)
			continue;

		/*
		 * Keep fds the caller wants to share with the parent, such
		 * as the pipes set up earlier.
		 */
		bool keep = false;
		for (size_t i = 0; i < size; ++i) {
			if (inheritable_fds[i] == fd) {
				keep = true;
				break;
			}
		}
		if (!keep)
			close(fd);
	}
	closedir(d);
	return 0;
}
3199 | | |
/*
 * Return true if the specified file descriptor is already open.
 * Any fcntl() failure other than EBADF is treated as "open".
 */
int minijail_fd_is_open(int fd)
{
	if (fcntl(fd, F_GETFD) != -1)
		return 1;
	return errno != EBADF;
}
3205 | | |
3206 | | /* |
3207 | | * Returns true if |check_fd| is one of j->preserved_fds[:max_index].child_fd. |
3208 | | */ |
3209 | | static bool is_preserved_child_fd(struct minijail *j, int check_fd, |
3210 | | size_t max_index) |
3211 | 0 | { |
3212 | 0 | max_index = MIN(max_index, j->preserved_fd_count); |
3213 | 0 | for (size_t i = 0; i < max_index; i++) { |
3214 | 0 | if (j->preserved_fds[i].child_fd == check_fd) { |
3215 | 0 | return true; |
3216 | 0 | } |
3217 | 0 | } |
3218 | 0 | return false; |
3219 | 0 | } |
3220 | | |
/*
 * If parent_fd will be used by a child fd, move it to an unused fd.
 *
 * |max_index| bounds which preserved-fd entries count as conflicts, so a
 * mapping never conflicts with itself or with later entries.  On success
 * *parent_fd is updated to the relocated fd.  Returns 0 on success, -1 if
 * dup2() failed.
 */
static int ensure_no_fd_conflict(struct minijail *j, int child_fd,
				 int *parent_fd, size_t max_index)
{
	/* Fast path: the parent fd is not one of the target child fds. */
	if (!is_preserved_child_fd(j, *parent_fd, max_index)) {
		return 0;
	}

	/*
	 * If no other parent_fd matches the child_fd then use it instead of a
	 * temporary.
	 */
	int fd = child_fd;
	if (fd == -1 || minijail_fd_is_open(fd)) {
		/*
		 * Search downward from 1023 for an fd that is neither open
		 * nor claimed as any preserved child fd.
		 */
		fd = 1023;
		while (is_preserved_child_fd(j, fd, j->preserved_fd_count) ||
		       minijail_fd_is_open(fd)) {
			--fd;
			if (fd < 0) {
				die("failed to find an unused fd");
			}
		}
	}

	int ret = dup2(*parent_fd, fd);
	/*
	 * warn() opens a file descriptor so it needs to happen after dup2 to
	 * avoid unintended side effects. This can be avoided by reordering the
	 * mapping requests so that the source fds with overlap are mapped
	 * first (unless there are cycles).
	 */
	warn("mapped fd overlap: moving %d to %d", *parent_fd, fd);
	if (ret == -1) {
		return -1;
	}

	*parent_fd = fd;
	return 0;
}
3260 | | |
3261 | | /* |
3262 | | * Check for contradictory mappings and create temporaries for parent file |
3263 | | * descriptors that would otherwise be overwritten during redirect_fds(). |
3264 | | */ |
3265 | | static int prepare_preserved_fds(struct minijail *j) |
3266 | 0 | { |
3267 | | /* Relocate parent_fds that would be replaced by a child_fd. */ |
3268 | 0 | for (size_t i = 0; i < j->preserved_fd_count; i++) { |
3269 | 0 | int child_fd = j->preserved_fds[i].child_fd; |
3270 | 0 | if (is_preserved_child_fd(j, child_fd, i)) { |
3271 | 0 | die("fd %d is mapped more than once", child_fd); |
3272 | 0 | } |
3273 | |
|
3274 | 0 | int *parent_fd = &j->preserved_fds[i].parent_fd; |
3275 | 0 | if (ensure_no_fd_conflict(j, child_fd, parent_fd, i) == -1) { |
3276 | 0 | return -1; |
3277 | 0 | } |
3278 | 0 | } |
3279 | 0 | return 0; |
3280 | 0 | } |
3281 | | |
/*
 * Structure holding resources and state created when running a minijail.
 */
struct minijail_run_state {
	pid_t child_pid; /* Jailed child's pid; reset to -1 on cleanup. */
	int pipe_fds[2]; /* Pipe whose read end is exported via kFdEnvVar. */
	int stdin_fds[2];  /* Pipe backing the child's stdin. */
	int stdout_fds[2]; /* Pipe backing the child's stdout. */
	int stderr_fds[2]; /* Pipe backing the child's stderr. */
	/* Presumably used to gate child startup — confirm against callers. */
	int child_sync_pipe_fds[2];
	char **child_env; /* Child environment; freed by run-state cleanup. */
};
3294 | | |
3295 | | /* |
3296 | | * Move pipe_fds if they conflict with a child_fd. |
3297 | | */ |
3298 | | static int avoid_pipe_conflicts(struct minijail *j, |
3299 | | struct minijail_run_state *state) |
3300 | 0 | { |
3301 | 0 | int *pipe_fds[] = { |
3302 | 0 | state->pipe_fds, state->child_sync_pipe_fds, state->stdin_fds, |
3303 | 0 | state->stdout_fds, state->stderr_fds, |
3304 | 0 | }; |
3305 | 0 | for (size_t i = 0; i < ARRAY_SIZE(pipe_fds); ++i) { |
3306 | 0 | if (pipe_fds[i][0] != -1 && |
3307 | 0 | ensure_no_fd_conflict(j, -1, &pipe_fds[i][0], |
3308 | 0 | j->preserved_fd_count) == -1) { |
3309 | 0 | return -1; |
3310 | 0 | } |
3311 | 0 | if (pipe_fds[i][1] != -1 && |
3312 | 0 | ensure_no_fd_conflict(j, -1, &pipe_fds[i][1], |
3313 | 0 | j->preserved_fd_count) == -1) { |
3314 | 0 | return -1; |
3315 | 0 | } |
3316 | 0 | } |
3317 | 0 | return 0; |
3318 | 0 | } |
3319 | | |
/*
 * Redirect j->preserved_fds from the parent_fd to the child_fd.
 *
 * NOTE: This will clear FD_CLOEXEC since otherwise the child_fd would not be
 * inherited after the exec call.
 *
 * Afterwards, every parent fd that is not itself a mapped child fd is
 * closed.  Returns 0 on success, -1 if any dup2() failed.
 */
static int redirect_fds(struct minijail *j)
{
	for (size_t i = 0; i < j->preserved_fd_count; i++) {
		/* Identity mappings need no dup2, only a CLOEXEC fixup. */
		if (j->preserved_fds[i].parent_fd ==
		    j->preserved_fds[i].child_fd) {
			// Clear CLOEXEC if it is set so the FD will be
			// inherited by the child.
			int flags =
			    fcntl(j->preserved_fds[i].child_fd, F_GETFD);
			// Nothing to do if F_GETFD failed or CLOEXEC is
			// already clear.
			if (flags == -1 || (flags & FD_CLOEXEC) == 0) {
				continue;
			}

			// Currently FD_CLOEXEC is cleared without being
			// restored. It may make sense to track when this
			// happens and restore FD_CLOEXEC in the child process.
			flags &= ~FD_CLOEXEC;
			if (fcntl(j->preserved_fds[i].child_fd, F_SETFD,
				  flags) == -1) {
				pwarn("failed to clear CLOEXEC for %d",
				      j->preserved_fds[i].parent_fd);
			}
			continue;
		}
		if (dup2(j->preserved_fds[i].parent_fd,
			 j->preserved_fds[i].child_fd) == -1) {
			return -1;
		}
	}

	/*
	 * After all fds have been duped, we are now free to close all parent
	 * fds that are *not* child fds.
	 */
	for (size_t i = 0; i < j->preserved_fd_count; i++) {
		int parent_fd = j->preserved_fds[i].parent_fd;
		if (!is_preserved_child_fd(j, parent_fd,
					   j->preserved_fd_count)) {
			close(parent_fd);
		}
	}
	return 0;
}
3369 | | |
3370 | | static void minijail_free_run_state(struct minijail_run_state *state) |
3371 | 0 | { |
3372 | 0 | state->child_pid = -1; |
3373 | |
|
3374 | 0 | int *fd_pairs[] = {state->pipe_fds, state->stdin_fds, state->stdout_fds, |
3375 | 0 | state->stderr_fds, state->child_sync_pipe_fds}; |
3376 | 0 | for (size_t i = 0; i < ARRAY_SIZE(fd_pairs); ++i) { |
3377 | 0 | close_and_reset(&fd_pairs[i][0]); |
3378 | 0 | close_and_reset(&fd_pairs[i][1]); |
3379 | 0 | } |
3380 | |
|
3381 | 0 | minijail_free_env(state->child_env); |
3382 | 0 | state->child_env = NULL; |
3383 | 0 | } |
3384 | | |
/*
 * Set up stdin/stdout/stderr file descriptors in the child.
 *
 * Dups the pipe ends created by the parent onto fds 0/1/2, closes the
 * temporary pipe fds, and (unless disabled) starts a new session to block
 * TIOCSTI-based jail escapes.  Dies on any failure.
 */
static void setup_child_std_fds(struct minijail *j,
				struct minijail_run_state *state)
{
	struct {
		const char *name;
		int from;
		int to;
	} fd_map[] = {
	    {"stdin", state->stdin_fds[0], STDIN_FILENO},
	    {"stdout", state->stdout_fds[1], STDOUT_FILENO},
	    {"stderr", state->stderr_fds[1], STDERR_FILENO},
	};

	/* -1 means "no pipe requested"; equal fds need no dup2. */
	for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) {
		if (fd_map[i].from == -1 || fd_map[i].from == fd_map[i].to)
			continue;
		if (dup2(fd_map[i].from, fd_map[i].to) == -1)
			die("failed to set up %s pipe", fd_map[i].name);
	}

	/* Close temporary pipe file descriptors. */
	int *std_pipes[] = {state->stdin_fds, state->stdout_fds,
			    state->stderr_fds};
	for (size_t i = 0; i < ARRAY_SIZE(std_pipes); ++i) {
		close_and_reset(&std_pipes[i][0]);
		close_and_reset(&std_pipes[i][1]);
	}

	/* Make sure we're not trying to skip setsid() with a PID namespace. */
	if (!j->flags.enable_new_sessions && j->flags.pids) {
		die("cannot skip setsid() with PID namespace");
	}

	/*
	 * If new sessions are enabled and any of stdin, stdout, or stderr are
	 * TTYs, or setsid flag is set, create a new session. This prevents
	 * the jailed process from using the TIOCSTI ioctl to push characters
	 * into the parent process terminal's input buffer, therefore escaping
	 * the jail.
	 *
	 * Since it has just forked, the child will not be a process group
	 * leader, and this call to setsid() should always succeed.
	 */
	if (j->flags.enable_new_sessions &&
	    (j->flags.setsid || isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) ||
	     isatty(STDERR_FILENO))) {
		if (setsid() < 0) {
			pdie("setsid() failed");
		}

		/* Reacquire a controlling terminal for the new session. */
		if (isatty(STDIN_FILENO)) {
			ioctl(STDIN_FILENO, TIOCSCTTY, 0);
		}
	}
}
3441 | | |
/*
 * Structure that specifies how to start a minijail.
 *
 * filename - The program to exec in the child. Should be NULL if elf_fd is set.
 * elf_fd - A fd to be used with fexecve. Should be -1 if filename is set.
 * NOTE: either filename or elf_fd is required if |exec_in_child| = 1; they
 * are mutually exclusive.
 * argv - Arguments for the child program. Required if |exec_in_child| = 1.
 * envp - Environment for the child program. Available if |exec_in_child| = 1.
 * use_preload - If true use LD_PRELOAD.
 * exec_in_child - If true, run |filename|. Otherwise, the child will return to
 * the caller.
 * pstdin_fd - Filled with stdin pipe if non-NULL.
 * pstdout_fd - Filled with stdout pipe if non-NULL.
 * pstderr_fd - Filled with stderr pipe if non-NULL.
 * pchild_pid - Filled with the pid of the child process if non-NULL.
 */
struct minijail_run_config {
	const char *filename;
	int elf_fd;
	char *const *argv;
	char *const *envp;
	int use_preload;  /* boolean */
	int exec_in_child; /* boolean */
	int *pstdin_fd;
	int *pstdout_fd;
	int *pstderr_fd;
	pid_t *pchild_pid;
};
3470 | | |
3471 | | static int |
3472 | | minijail_run_config_internal(struct minijail *j, |
3473 | | const struct minijail_run_config *config); |
3474 | | |
3475 | | int API minijail_run(struct minijail *j, const char *filename, |
3476 | | char *const argv[]) |
3477 | 0 | { |
3478 | 0 | struct minijail_run_config config = { |
3479 | 0 | .filename = filename, |
3480 | 0 | .elf_fd = -1, |
3481 | 0 | .argv = argv, |
3482 | 0 | .envp = NULL, |
3483 | 0 | .use_preload = true, |
3484 | 0 | .exec_in_child = true, |
3485 | 0 | }; |
3486 | 0 | return minijail_run_config_internal(j, &config); |
3487 | 0 | } |
3488 | | |
3489 | | int API minijail_run_env(struct minijail *j, const char *filename, |
3490 | | char *const argv[], char *const envp[]) |
3491 | 0 | { |
3492 | 0 | struct minijail_run_config config = { |
3493 | 0 | .filename = filename, |
3494 | 0 | .elf_fd = -1, |
3495 | 0 | .argv = argv, |
3496 | 0 | .envp = envp, |
3497 | 0 | .use_preload = true, |
3498 | 0 | .exec_in_child = true, |
3499 | 0 | }; |
3500 | 0 | return minijail_run_config_internal(j, &config); |
3501 | 0 | } |
3502 | | |
3503 | | int API minijail_run_pid(struct minijail *j, const char *filename, |
3504 | | char *const argv[], pid_t *pchild_pid) |
3505 | 0 | { |
3506 | 0 | struct minijail_run_config config = { |
3507 | 0 | .filename = filename, |
3508 | 0 | .elf_fd = -1, |
3509 | 0 | .argv = argv, |
3510 | 0 | .envp = NULL, |
3511 | 0 | .use_preload = true, |
3512 | 0 | .exec_in_child = true, |
3513 | 0 | .pchild_pid = pchild_pid, |
3514 | 0 | }; |
3515 | 0 | return minijail_run_config_internal(j, &config); |
3516 | 0 | } |
3517 | | |
3518 | | int API minijail_run_pipe(struct minijail *j, const char *filename, |
3519 | | char *const argv[], int *pstdin_fd) |
3520 | 0 | { |
3521 | 0 | struct minijail_run_config config = { |
3522 | 0 | .filename = filename, |
3523 | 0 | .elf_fd = -1, |
3524 | 0 | .argv = argv, |
3525 | 0 | .envp = NULL, |
3526 | 0 | .use_preload = true, |
3527 | 0 | .exec_in_child = true, |
3528 | 0 | .pstdin_fd = pstdin_fd, |
3529 | 0 | }; |
3530 | 0 | return minijail_run_config_internal(j, &config); |
3531 | 0 | } |
3532 | | |
3533 | | int API minijail_run_pid_pipes(struct minijail *j, const char *filename, |
3534 | | char *const argv[], pid_t *pchild_pid, |
3535 | | int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) |
3536 | 0 | { |
3537 | 0 | struct minijail_run_config config = { |
3538 | 0 | .filename = filename, |
3539 | 0 | .elf_fd = -1, |
3540 | 0 | .argv = argv, |
3541 | 0 | .envp = NULL, |
3542 | 0 | .use_preload = true, |
3543 | 0 | .exec_in_child = true, |
3544 | 0 | .pstdin_fd = pstdin_fd, |
3545 | 0 | .pstdout_fd = pstdout_fd, |
3546 | 0 | .pstderr_fd = pstderr_fd, |
3547 | 0 | .pchild_pid = pchild_pid, |
3548 | 0 | }; |
3549 | 0 | return minijail_run_config_internal(j, &config); |
3550 | 0 | } |
3551 | | |
3552 | | int API minijail_run_env_pid_pipes(struct minijail *j, const char *filename, |
3553 | | char *const argv[], char *const envp[], |
3554 | | pid_t *pchild_pid, int *pstdin_fd, |
3555 | | int *pstdout_fd, int *pstderr_fd) |
3556 | 0 | { |
3557 | 0 | struct minijail_run_config config = { |
3558 | 0 | .filename = filename, |
3559 | 0 | .elf_fd = -1, |
3560 | 0 | .argv = argv, |
3561 | 0 | .envp = envp, |
3562 | 0 | .use_preload = true, |
3563 | 0 | .exec_in_child = true, |
3564 | 0 | .pstdin_fd = pstdin_fd, |
3565 | 0 | .pstdout_fd = pstdout_fd, |
3566 | 0 | .pstderr_fd = pstderr_fd, |
3567 | 0 | .pchild_pid = pchild_pid, |
3568 | 0 | }; |
3569 | 0 | return minijail_run_config_internal(j, &config); |
3570 | 0 | } |
3571 | | |
3572 | | int API minijail_run_fd_env_pid_pipes(struct minijail *j, int elf_fd, |
3573 | | char *const argv[], char *const envp[], |
3574 | | pid_t *pchild_pid, int *pstdin_fd, |
3575 | | int *pstdout_fd, int *pstderr_fd) |
3576 | 0 | { |
3577 | 0 | struct minijail_run_config config = { |
3578 | 0 | .filename = NULL, |
3579 | 0 | .elf_fd = elf_fd, |
3580 | 0 | .argv = argv, |
3581 | 0 | .envp = envp, |
3582 | 0 | .use_preload = true, |
3583 | 0 | .exec_in_child = true, |
3584 | 0 | .pstdin_fd = pstdin_fd, |
3585 | 0 | .pstdout_fd = pstdout_fd, |
3586 | 0 | .pstderr_fd = pstderr_fd, |
3587 | 0 | .pchild_pid = pchild_pid, |
3588 | 0 | }; |
3589 | 0 | return minijail_run_config_internal(j, &config); |
3590 | 0 | } |
3591 | | |
3592 | | int API minijail_run_no_preload(struct minijail *j, const char *filename, |
3593 | | char *const argv[]) |
3594 | 0 | { |
3595 | 0 | struct minijail_run_config config = { |
3596 | 0 | .filename = filename, |
3597 | 0 | .elf_fd = -1, |
3598 | 0 | .argv = argv, |
3599 | 0 | .envp = NULL, |
3600 | 0 | .use_preload = false, |
3601 | 0 | .exec_in_child = true, |
3602 | 0 | }; |
3603 | 0 | return minijail_run_config_internal(j, &config); |
3604 | 0 | } |
3605 | | |
3606 | | int API minijail_run_pid_pipes_no_preload(struct minijail *j, |
3607 | | const char *filename, |
3608 | | char *const argv[], pid_t *pchild_pid, |
3609 | | int *pstdin_fd, int *pstdout_fd, |
3610 | | int *pstderr_fd) |
3611 | 0 | { |
3612 | 0 | struct minijail_run_config config = { |
3613 | 0 | .filename = filename, |
3614 | 0 | .elf_fd = -1, |
3615 | 0 | .argv = argv, |
3616 | 0 | .envp = NULL, |
3617 | 0 | .use_preload = false, |
3618 | 0 | .exec_in_child = true, |
3619 | 0 | .pstdin_fd = pstdin_fd, |
3620 | 0 | .pstdout_fd = pstdout_fd, |
3621 | 0 | .pstderr_fd = pstderr_fd, |
3622 | 0 | .pchild_pid = pchild_pid, |
3623 | 0 | }; |
3624 | 0 | return minijail_run_config_internal(j, &config); |
3625 | 0 | } |
3626 | | |
3627 | | int API minijail_run_env_pid_pipes_no_preload(struct minijail *j, |
3628 | | const char *filename, |
3629 | | char *const argv[], |
3630 | | char *const envp[], |
3631 | | pid_t *pchild_pid, int *pstdin_fd, |
3632 | | int *pstdout_fd, int *pstderr_fd) |
3633 | 0 | { |
3634 | 0 | struct minijail_run_config config = { |
3635 | 0 | .filename = filename, |
3636 | 0 | .elf_fd = -1, |
3637 | 0 | .argv = argv, |
3638 | 0 | .envp = envp, |
3639 | 0 | .use_preload = false, |
3640 | 0 | .exec_in_child = true, |
3641 | 0 | .pstdin_fd = pstdin_fd, |
3642 | 0 | .pstdout_fd = pstdout_fd, |
3643 | 0 | .pstderr_fd = pstderr_fd, |
3644 | 0 | .pchild_pid = pchild_pid, |
3645 | 0 | }; |
3646 | 0 | return minijail_run_config_internal(j, &config); |
3647 | 0 | } |
3648 | | |
3649 | | pid_t API minijail_fork(struct minijail *j) |
3650 | 0 | { |
3651 | 0 | struct minijail_run_config config = { |
3652 | 0 | .elf_fd = -1, |
3653 | 0 | }; |
3654 | 0 | return minijail_run_config_internal(j, &config); |
3655 | 0 | } |
3656 | | |
3657 | | static int minijail_run_internal(struct minijail *j, |
3658 | | const struct minijail_run_config *config, |
3659 | | struct minijail_run_state *state_out) |
3660 | 0 | { |
3661 | 0 | int sync_child = 0; |
3662 | 0 | int ret; |
3663 | | /* We need to remember this across the minijail_preexec() call. */ |
3664 | 0 | int pid_namespace = j->flags.pids; |
3665 | | /* |
3666 | | * Create an init process if we are entering a pid namespace, unless the |
3667 | | * user has explicitly opted out by calling minijail_run_as_init(). |
3668 | | */ |
3669 | 0 | int do_init = j->flags.do_init && !j->flags.run_as_init; |
3670 | 0 | int use_preload = config->use_preload; |
3671 | |
|
3672 | 0 | if (config->filename != NULL && config->elf_fd != -1) { |
3673 | 0 | die("filename and elf_fd cannot be set at the same time"); |
3674 | 0 | } |
3675 | 0 | if (config->filename != NULL) { |
3676 | 0 | j->filename = strdup(config->filename); |
3677 | 0 | } |
3678 | | |
3679 | | /* |
3680 | | * Only copy the environment if we need to modify it. If this is done |
3681 | | * unconditionally, it triggers odd behavior in the ARC container. |
3682 | | */ |
3683 | 0 | if (use_preload || j->seccomp_policy_path) { |
3684 | 0 | state_out->child_env = |
3685 | 0 | minijail_copy_env(config->envp ? config->envp : environ); |
3686 | 0 | if (!state_out->child_env) |
3687 | 0 | return ENOMEM; |
3688 | 0 | } |
3689 | | |
3690 | 0 | if (j->seccomp_policy_path && |
3691 | 0 | setup_seccomp_policy_path(j, &state_out->child_env)) |
3692 | 0 | return -EFAULT; |
3693 | | |
3694 | 0 | if (use_preload) { |
3695 | 0 | if (j->hooks_head != NULL) |
3696 | 0 | die("Minijail hooks are not supported with LD_PRELOAD"); |
3697 | 0 | if (!config->exec_in_child) |
3698 | 0 | die("minijail_fork is not supported with LD_PRELOAD"); |
3699 | | |
3700 | | /* |
3701 | | * Before we fork(2) and execve(2) the child process, we need |
3702 | | * to open a pipe(2) to send the minijail configuration over. |
3703 | | */ |
3704 | 0 | if (setup_preload(j, &state_out->child_env) || |
3705 | 0 | setup_pipe(&state_out->child_env, state_out->pipe_fds)) |
3706 | 0 | return -EFAULT; |
3707 | 0 | } else { |
3708 | 0 | if (j->flags.use_caps && j->caps != 0 && |
3709 | 0 | !j->flags.set_ambient_caps) { |
3710 | 0 | die("non-empty, non-ambient capabilities are not " |
3711 | 0 | "supported without LD_PRELOAD"); |
3712 | 0 | } |
3713 | 0 | } |
3714 | | |
3715 | | /* Create pipes for stdin/stdout/stderr as requested by caller. */ |
3716 | 0 | struct { |
3717 | 0 | bool requested; |
3718 | 0 | int *pipe_fds; |
3719 | 0 | } pipe_fd_req[] = { |
3720 | 0 | {config->pstdin_fd != NULL, state_out->stdin_fds}, |
3721 | 0 | {config->pstdout_fd != NULL, state_out->stdout_fds}, |
3722 | 0 | {config->pstderr_fd != NULL, state_out->stderr_fds}, |
3723 | 0 | }; |
3724 | |
|
3725 | 0 | for (size_t i = 0; i < ARRAY_SIZE(pipe_fd_req); ++i) { |
3726 | 0 | if (pipe_fd_req[i].requested && |
3727 | 0 | pipe(pipe_fd_req[i].pipe_fds) == -1) |
3728 | 0 | return EFAULT; |
3729 | 0 | } |
3730 | | |
3731 | | /* |
3732 | | * If the parent process needs to configure the child's runtime |
3733 | | * environment after forking, create a pipe(2) to block the child until |
3734 | | * configuration is done. |
3735 | | */ |
3736 | 0 | if (j->flags.forward_signals || j->flags.pid_file || j->flags.cgroups || |
3737 | 0 | j->rlimit_count || j->flags.userns) { |
3738 | 0 | sync_child = 1; |
3739 | 0 | if (pipe(state_out->child_sync_pipe_fds)) |
3740 | 0 | return -EFAULT; |
3741 | 0 | } |
3742 | | |
3743 | | /* |
3744 | | * Use sys_clone() if and only if we're creating a pid namespace. |
3745 | | * |
3746 | | * tl;dr: WARNING: do not mix pid namespaces and multithreading. |
3747 | | * |
3748 | | * In multithreaded programs, there are a bunch of locks inside libc, |
3749 | | * some of which may be held by other threads at the time that we call |
3750 | | * minijail_run_pid(). If we call fork(), glibc does its level best to |
3751 | | * ensure that we hold all of these locks before it calls clone() |
3752 | | * internally and drop them after clone() returns, but when we call |
3753 | | * sys_clone(2) directly, all that gets bypassed and we end up with a |
3754 | | * child address space where some of libc's important locks are held by |
3755 | | * other threads (which did not get cloned, and hence will never release |
3756 | | * those locks). This is okay so long as we call exec() immediately |
3757 | | * after, but a bunch of seemingly-innocent libc functions like setenv() |
3758 | | * take locks. |
3759 | | * |
3760 | | * Hence, only call sys_clone() if we need to, in order to get at pid |
3761 | | * namespacing. If we follow this path, the child's address space might |
3762 | | * have broken locks; you may only call functions that do not acquire |
3763 | | * any locks. |
3764 | | * |
3765 | | * Unfortunately, fork() acquires every lock it can get its hands on, as |
3766 | | * previously detailed, so this function is highly likely to deadlock |
3767 | | * later on (see "deadlock here") if we're multithreaded. |
3768 | | * |
3769 | | * We might hack around this by having the clone()d child (init of the |
3770 | | * pid namespace) return directly, rather than leaving the clone()d |
3771 | | * process hanging around to be init for the new namespace (and having |
3772 | | * its fork()ed child return in turn), but that process would be |
3773 | | * crippled with its libc locks potentially broken. We might try |
3774 | | * fork()ing in the parent before we clone() to ensure that we own all |
3775 | | * the locks, but then we have to have the forked child hanging around |
3776 | | * consuming resources (and possibly having file descriptors / shared |
3777 | | * memory regions / etc attached). We'd need to keep the child around to |
3778 | | * avoid having its children get reparented to init. |
3779 | | * |
3780 | | * TODO(b/317404364): figure out if the "forked child hanging around" |
3781 | | * problem is fixable or not. It would be nice if we worked in this |
3782 | | * case. |
3783 | | */ |
3784 | 0 | pid_t child_pid; |
3785 | 0 | if (pid_namespace) { |
3786 | 0 | unsigned long clone_flags = CLONE_NEWPID | SIGCHLD; |
3787 | 0 | if (j->flags.userns) |
3788 | 0 | clone_flags |= CLONE_NEWUSER; |
3789 | |
|
3790 | 0 | child_pid = syscall(SYS_clone, clone_flags, NULL, 0L, 0L, 0L); |
3791 | |
|
3792 | 0 | if (child_pid < 0) { |
3793 | 0 | if (errno == EPERM) |
3794 | 0 | pdie("clone(CLONE_NEWPID | ...) failed with " |
3795 | 0 | "EPERM; is this process missing " |
3796 | 0 | "CAP_SYS_ADMIN?"); |
3797 | 0 | pdie("clone(CLONE_NEWPID | ...) failed"); |
3798 | 0 | } |
3799 | 0 | } else { |
3800 | 0 | if (j->flags.userns) |
3801 | 0 | die("user namespaces in Minijail require a PID " |
3802 | 0 | "namespace"); |
3803 | |
|
3804 | 0 | child_pid = fork(); |
3805 | |
|
3806 | 0 | if (child_pid < 0) |
3807 | 0 | pdie("fork failed"); |
3808 | 0 | } |
3809 | | |
3810 | | /* |
3811 | | * setup_fs_rules_fd() needs to be called before close_open_fds(), and |
3812 | | * before logic for the child process. |
3813 | | */ |
3814 | 0 | if (j->fs_rules_head) { |
3815 | 0 | setup_fs_rules_fd(j); |
3816 | 0 | minijail_preserve_fd(j, j->fs_rules_fd, j->fs_rules_fd); |
3817 | 0 | } |
3818 | |
|
3819 | 0 | state_out->child_pid = child_pid; |
3820 | 0 | if (child_pid) { |
3821 | 0 | j->initpid = child_pid; |
3822 | |
|
3823 | 0 | if (j->flags.forward_signals) { |
3824 | 0 | forward_pid = child_pid; |
3825 | 0 | install_signal_handlers(); |
3826 | 0 | } |
3827 | |
|
3828 | 0 | if (j->flags.pid_file) |
3829 | 0 | write_pid_file_or_die(j); |
3830 | |
|
3831 | 0 | if (j->flags.cgroups) |
3832 | 0 | add_to_cgroups_or_die(j); |
3833 | |
|
3834 | 0 | if (j->rlimit_count) |
3835 | 0 | set_rlimits_or_die(j); |
3836 | |
|
3837 | 0 | if (j->flags.userns) |
3838 | 0 | write_ugid_maps_or_die(j); |
3839 | |
|
3840 | 0 | if (j->flags.enter_vfs) |
3841 | 0 | close(j->mountns_fd); |
3842 | |
|
3843 | 0 | if (j->flags.enter_net) |
3844 | 0 | close(j->netns_fd); |
3845 | |
|
3846 | 0 | if (sync_child) |
3847 | 0 | parent_setup_complete(state_out->child_sync_pipe_fds); |
3848 | |
|
3849 | 0 | if (use_preload) { |
3850 | | /* |
3851 | | * Add SIGPIPE to the signal mask to avoid getting |
3852 | | * killed if the child process finishes or closes its |
3853 | | * end of the pipe prematurely. |
3854 | | * |
3855 | | * TODO(crbug.com/1022170): Use pthread_sigmask instead |
3856 | | * of sigprocmask if Minijail is used in multithreaded |
3857 | | * programs. |
3858 | | */ |
3859 | 0 | sigset_t to_block, to_restore; |
3860 | 0 | if (sigemptyset(&to_block) < 0) |
3861 | 0 | pdie("sigemptyset failed"); |
3862 | 0 | if (sigaddset(&to_block, SIGPIPE) < 0) |
3863 | 0 | pdie("sigaddset failed"); |
3864 | 0 | if (sigprocmask(SIG_BLOCK, &to_block, &to_restore) < 0) |
3865 | 0 | pdie("sigprocmask failed"); |
3866 | | |
3867 | | /* Send marshalled minijail. */ |
3868 | 0 | close_and_reset(&state_out->pipe_fds[0]); |
3869 | 0 | ret = minijail_to_fd(j, state_out->pipe_fds[1]); |
3870 | 0 | close_and_reset(&state_out->pipe_fds[1]); |
3871 | | |
3872 | | /* Accept any pending SIGPIPE. */ |
3873 | 0 | while (true) { |
3874 | 0 | const struct timespec zero_time = {0, 0}; |
3875 | 0 | const int sig = |
3876 | 0 | sigtimedwait(&to_block, NULL, &zero_time); |
3877 | 0 | if (sig < 0) { |
3878 | 0 | if (errno != EINTR) |
3879 | 0 | break; |
3880 | 0 | } else { |
3881 | 0 | if (sig != SIGPIPE) |
3882 | 0 | die("unexpected signal %d", |
3883 | 0 | sig); |
3884 | 0 | } |
3885 | 0 | } |
3886 | | |
3887 | | /* Restore the signal mask to its original state. */ |
3888 | 0 | if (sigprocmask(SIG_SETMASK, &to_restore, NULL) < 0) |
3889 | 0 | pdie("sigprocmask failed"); |
3890 | |
|
3891 | 0 | if (ret) { |
3892 | 0 | warn("failed to send marshalled minijail: %s", |
3893 | 0 | strerror(-ret)); |
3894 | 0 | kill(j->initpid, SIGKILL); |
3895 | 0 | } |
3896 | 0 | } |
3897 | |
|
3898 | 0 | return 0; |
3899 | 0 | } |
3900 | | |
3901 | | /* Child process. */ |
3902 | 0 | if (j->flags.reset_signal_mask) { |
3903 | 0 | sigset_t signal_mask; |
3904 | 0 | if (sigemptyset(&signal_mask) != 0) |
3905 | 0 | pdie("sigemptyset failed"); |
3906 | 0 | if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) |
3907 | 0 | pdie("sigprocmask failed"); |
3908 | 0 | } |
3909 | |
|
3910 | 0 | if (j->flags.reset_signal_handlers) { |
3911 | 0 | int signum; |
3912 | 0 | for (signum = 0; signum <= SIGRTMAX; signum++) { |
3913 | | /* |
3914 | | * Ignore EINVAL since some signal numbers in the range |
3915 | | * might not be valid. |
3916 | | */ |
3917 | 0 | if (signal(signum, SIG_DFL) == SIG_ERR && |
3918 | 0 | errno != EINVAL) { |
3919 | 0 | pdie("failed to reset signal %d disposition", |
3920 | 0 | signum); |
3921 | 0 | } |
3922 | 0 | } |
3923 | 0 | } |
3924 | |
|
3925 | 0 | if (j->flags.close_open_fds) { |
3926 | 0 | const size_t kMaxInheritableFdsSize = 11 + MAX_PRESERVED_FDS; |
3927 | 0 | int inheritable_fds[kMaxInheritableFdsSize]; |
3928 | 0 | size_t size = 0; |
3929 | |
|
3930 | 0 | int *pipe_fds[] = { |
3931 | 0 | state_out->pipe_fds, state_out->child_sync_pipe_fds, |
3932 | 0 | state_out->stdin_fds, state_out->stdout_fds, |
3933 | 0 | state_out->stderr_fds, |
3934 | 0 | }; |
3935 | |
|
3936 | 0 | for (size_t i = 0; i < ARRAY_SIZE(pipe_fds); ++i) { |
3937 | 0 | if (pipe_fds[i][0] != -1) { |
3938 | 0 | inheritable_fds[size++] = pipe_fds[i][0]; |
3939 | 0 | } |
3940 | 0 | if (pipe_fds[i][1] != -1) { |
3941 | 0 | inheritable_fds[size++] = pipe_fds[i][1]; |
3942 | 0 | } |
3943 | 0 | } |
3944 | | |
3945 | | /* |
3946 | | * Preserve namespace file descriptors over the close_open_fds() |
3947 | | * call. These are closed in minijail_enter() so they won't leak |
3948 | | * into the child process. |
3949 | | */ |
3950 | 0 | if (j->flags.enter_vfs) |
3951 | 0 | minijail_preserve_fd(j, j->mountns_fd, j->mountns_fd); |
3952 | 0 | if (j->flags.enter_net) |
3953 | 0 | minijail_preserve_fd(j, j->netns_fd, j->netns_fd); |
3954 | |
|
3955 | 0 | for (size_t i = 0; i < j->preserved_fd_count; i++) { |
3956 | | /* |
3957 | | * Preserve all parent_fds. They will be dup2(2)-ed in |
3958 | | * the child later. |
3959 | | */ |
3960 | 0 | inheritable_fds[size++] = j->preserved_fds[i].parent_fd; |
3961 | 0 | } |
3962 | |
|
3963 | 0 | if (config->elf_fd > -1) { |
3964 | 0 | inheritable_fds[size++] = config->elf_fd; |
3965 | 0 | } |
3966 | |
|
3967 | 0 | if (close_open_fds(inheritable_fds, size) < 0) |
3968 | 0 | die("failed to close open file descriptors"); |
3969 | 0 | } |
3970 | | |
3971 | | /* The set of fds will be replaced. */ |
3972 | 0 | if (prepare_preserved_fds(j)) |
3973 | 0 | die("failed to set up fd redirections"); |
3974 | |
|
3975 | 0 | if (avoid_pipe_conflicts(j, state_out)) |
3976 | 0 | die("failed to redirect conflicting pipes"); |
3977 | | |
3978 | | /* The elf_fd needs to be mutable so use a stack copy from now on. */ |
3979 | 0 | int elf_fd = config->elf_fd; |
3980 | 0 | if (elf_fd != -1 && |
3981 | 0 | ensure_no_fd_conflict(j, -1, &elf_fd, j->preserved_fd_count)) |
3982 | 0 | die("failed to redirect elf_fd"); |
3983 | |
|
3984 | 0 | if (redirect_fds(j)) |
3985 | 0 | die("failed to set up fd redirections"); |
3986 | |
|
3987 | 0 | if (sync_child) |
3988 | 0 | wait_for_parent_setup(state_out->child_sync_pipe_fds); |
3989 | |
|
3990 | 0 | if (j->flags.userns) |
3991 | 0 | enter_user_namespace(j); |
3992 | |
|
3993 | 0 | setup_child_std_fds(j, state_out); |
3994 | | |
3995 | | /* If running an init program, let it decide when/how to mount /proc. */ |
3996 | 0 | if (pid_namespace && !do_init) |
3997 | 0 | j->flags.remount_proc_ro = 0; |
3998 | |
|
3999 | 0 | if (use_preload) { |
4000 | | /* Strip out flags that cannot be inherited across execve(2). */ |
4001 | 0 | minijail_preexec(j); |
4002 | 0 | } else { |
4003 | | /* |
4004 | | * If not using LD_PRELOAD, do all jailing before execve(2). |
4005 | | * Note that PID namespaces can only be entered on fork(2), |
4006 | | * so that flag is still cleared. |
4007 | | */ |
4008 | 0 | j->flags.pids = 0; |
4009 | 0 | } |
4010 | | |
4011 | | /* |
4012 | | * Jail this process. |
4013 | | * If forking, return. |
4014 | | * If not, execve(2) the target. |
4015 | | */ |
4016 | 0 | minijail_enter(j); |
4017 | |
|
4018 | 0 | if (config->exec_in_child && pid_namespace && do_init) { |
4019 | | /* |
4020 | | * pid namespace: this process will become init inside the new |
4021 | | * namespace. We don't want all programs we might exec to have |
4022 | | * to know how to be init. Normally (do_init == 1) we fork off |
4023 | | * a child to actually run the program. If |do_init == 0|, we |
4024 | | * let the program keep pid 1 and be init. |
4025 | | * |
4026 | | * If we're multithreaded, we'll probably deadlock here. See |
4027 | | * WARNING above. |
4028 | | */ |
4029 | 0 | child_pid = fork(); |
4030 | 0 | if (child_pid < 0) { |
4031 | 0 | _exit(child_pid); |
4032 | 0 | } else if (child_pid > 0) { |
4033 | 0 | minijail_free_run_state(state_out); |
4034 | | |
4035 | | /* |
4036 | | * Best effort. Don't bother checking the return value. |
4037 | | */ |
4038 | 0 | prctl(PR_SET_NAME, "minijail-init"); |
4039 | 0 | init(child_pid); /* Never returns. */ |
4040 | 0 | } |
4041 | 0 | state_out->child_pid = child_pid; |
4042 | 0 | } |
4043 | | |
4044 | 0 | run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_EXECVE); |
4045 | |
|
4046 | 0 | if (!config->exec_in_child) |
4047 | 0 | return 0; |
4048 | | |
4049 | | /* |
4050 | | * We're going to execve(), so make sure any remaining resources are |
4051 | | * freed. Exceptions are: |
4052 | | * 1. The child environment. No need to worry about freeing it since |
4053 | | * execve reinitializes the heap anyways. |
4054 | | * 2. The read side of the LD_PRELOAD pipe, which we need to hand down |
4055 | | * into the target in which the preloaded code will read from it and |
4056 | | * then close it. |
4057 | | */ |
4058 | 0 | state_out->pipe_fds[0] = -1; |
4059 | 0 | char *const *child_env = state_out->child_env; |
4060 | 0 | state_out->child_env = NULL; |
4061 | 0 | minijail_free_run_state(state_out); |
4062 | | |
4063 | | /* |
4064 | | * If we aren't pid-namespaced, or the jailed program asked to be init: |
4065 | | * calling process |
4066 | | * -> execve()-ing process |
4067 | | * If we are: |
4068 | | * calling process |
4069 | | * -> init()-ing process |
4070 | | * -> execve()-ing process |
4071 | | */ |
4072 | 0 | if (!child_env) |
4073 | 0 | child_env = config->envp ? config->envp : environ; |
4074 | 0 | if (elf_fd > -1) { |
4075 | 0 | fexecve(elf_fd, config->argv, child_env); |
4076 | 0 | pwarn("fexecve(%d) failed", config->elf_fd); |
4077 | 0 | } else { |
4078 | 0 | execve(config->filename, config->argv, child_env); |
4079 | 0 | pwarn("execve(%s) failed", config->filename); |
4080 | 0 | } |
4081 | |
|
4082 | 0 | ret = (errno == ENOENT ? MINIJAIL_ERR_NO_COMMAND |
4083 | 0 | : MINIJAIL_ERR_NO_ACCESS); |
4084 | 0 | _exit(ret); |
4085 | 0 | } |
4086 | | |
4087 | | static int |
4088 | | minijail_run_config_internal(struct minijail *j, |
4089 | | const struct minijail_run_config *config) |
4090 | 0 | { |
4091 | 0 | struct minijail_run_state state = { |
4092 | 0 | .child_pid = -1, |
4093 | 0 | .pipe_fds = {-1, -1}, |
4094 | 0 | .stdin_fds = {-1, -1}, |
4095 | 0 | .stdout_fds = {-1, -1}, |
4096 | 0 | .stderr_fds = {-1, -1}, |
4097 | 0 | .child_sync_pipe_fds = {-1, -1}, |
4098 | 0 | .child_env = NULL, |
4099 | 0 | }; |
4100 | 0 | int ret = minijail_run_internal(j, config, &state); |
4101 | |
|
4102 | 0 | if (ret == 0) { |
4103 | 0 | if (config->pchild_pid) |
4104 | 0 | *config->pchild_pid = state.child_pid; |
4105 | | |
4106 | | /* Grab stdin/stdout/stderr descriptors requested by caller. */ |
4107 | 0 | struct { |
4108 | 0 | int *pfd; |
4109 | 0 | int *psrc; |
4110 | 0 | } fd_map[] = { |
4111 | 0 | {config->pstdin_fd, &state.stdin_fds[1]}, |
4112 | 0 | {config->pstdout_fd, &state.stdout_fds[0]}, |
4113 | 0 | {config->pstderr_fd, &state.stderr_fds[0]}, |
4114 | 0 | }; |
4115 | |
|
4116 | 0 | for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) { |
4117 | 0 | if (fd_map[i].pfd) { |
4118 | 0 | *fd_map[i].pfd = *fd_map[i].psrc; |
4119 | 0 | *fd_map[i].psrc = -1; |
4120 | 0 | } |
4121 | 0 | } |
4122 | |
|
4123 | 0 | if (!config->exec_in_child) |
4124 | 0 | ret = state.child_pid; |
4125 | 0 | } |
4126 | |
|
4127 | 0 | minijail_free_run_state(&state); |
4128 | |
|
4129 | 0 | return ret; |
4130 | 0 | } |
4131 | | |
4132 | | static int minijail_wait_internal(struct minijail *j, int expected_signal) |
4133 | 0 | { |
4134 | 0 | if (j->initpid <= 0) |
4135 | 0 | return -ECHILD; |
4136 | | |
4137 | 0 | int st; |
4138 | 0 | while (true) { |
4139 | 0 | const int ret = waitpid(j->initpid, &st, 0); |
4140 | 0 | if (ret >= 0) |
4141 | 0 | break; |
4142 | 0 | if (errno != EINTR) |
4143 | 0 | return -errno; |
4144 | 0 | } |
4145 | | |
4146 | 0 | if (!WIFEXITED(st)) { |
4147 | 0 | int error_status = st; |
4148 | 0 | if (!WIFSIGNALED(st)) { |
4149 | 0 | return error_status; |
4150 | 0 | } |
4151 | | |
4152 | 0 | int signum = WTERMSIG(st); |
4153 | | /* |
4154 | | * We return MINIJAIL_ERR_SECCOMP_VIOLATION if the process |
4155 | | * received SIGSYS, which happens when a syscall is blocked by |
4156 | | * SECCOMP filters. |
4157 | | * |
4158 | | * If not, we do what bash(1) does: $? = 128 + signum |
4159 | | */ |
4160 | 0 | if (signum == SIGSYS) { |
4161 | 0 | warn("child process %d had a policy violation (%s)", |
4162 | 0 | j->initpid, |
4163 | 0 | j->seccomp_policy_path ? j->seccomp_policy_path |
4164 | 0 | : "NO-LABEL"); |
4165 | 0 | error_status = MINIJAIL_ERR_SECCOMP_VIOLATION; |
4166 | 0 | } else { |
4167 | 0 | if (signum != expected_signal) { |
4168 | 0 | warn("child process %d received signal %d", |
4169 | 0 | j->initpid, signum); |
4170 | 0 | } |
4171 | 0 | error_status = MINIJAIL_ERR_SIG_BASE + signum; |
4172 | 0 | } |
4173 | 0 | return error_status; |
4174 | 0 | } |
4175 | | |
4176 | 0 | int exit_status = WEXITSTATUS(st); |
4177 | 0 | if (exit_status != 0) |
4178 | 0 | info("child process %d exited with status %d", j->initpid, |
4179 | 0 | exit_status); |
4180 | |
|
4181 | 0 | return exit_status; |
4182 | 0 | } |
4183 | | |
4184 | | int API minijail_kill(struct minijail *j) |
4185 | 0 | { |
4186 | 0 | if (j->initpid <= 0) |
4187 | 0 | return -ECHILD; |
4188 | | |
4189 | 0 | if (kill(j->initpid, SIGTERM)) |
4190 | 0 | return -errno; |
4191 | | |
4192 | 0 | return minijail_wait_internal(j, SIGTERM); |
4193 | 0 | } |
4194 | | |
/* Waits for the jailed child; no particular termination signal expected. */
int API minijail_wait(struct minijail *j)
{
	return minijail_wait_internal(j, 0);
}
4199 | | |
4200 | | void API minijail_destroy(struct minijail *j) |
4201 | 0 | { |
4202 | 0 | size_t i; |
4203 | |
|
4204 | 0 | if (j->filter_prog) { |
4205 | 0 | free(j->filter_prog->filter); |
4206 | 0 | free(j->filter_prog); |
4207 | 0 | } |
4208 | 0 | free_mounts_list(j); |
4209 | 0 | free_remounts_list(j); |
4210 | 0 | while (j->hooks_head) { |
4211 | 0 | struct hook *c = j->hooks_head; |
4212 | 0 | j->hooks_head = c->next; |
4213 | 0 | free(c); |
4214 | 0 | } |
4215 | 0 | j->hooks_tail = NULL; |
4216 | 0 | free_fs_rules_list(j); |
4217 | 0 | if (j->user) |
4218 | 0 | free(j->user); |
4219 | 0 | if (j->suppl_gid_list) |
4220 | 0 | free(j->suppl_gid_list); |
4221 | 0 | if (j->chrootdir) |
4222 | 0 | free(j->chrootdir); |
4223 | 0 | if (j->pid_file_path) |
4224 | 0 | free(j->pid_file_path); |
4225 | 0 | if (j->uidmap) |
4226 | 0 | free(j->uidmap); |
4227 | 0 | if (j->gidmap) |
4228 | 0 | free(j->gidmap); |
4229 | 0 | if (j->hostname) |
4230 | 0 | free(j->hostname); |
4231 | 0 | if (j->preload_path) |
4232 | 0 | free(j->preload_path); |
4233 | 0 | if (j->filename) |
4234 | 0 | free(j->filename); |
4235 | 0 | if (j->alt_syscall_table) |
4236 | 0 | free(j->alt_syscall_table); |
4237 | 0 | for (i = 0; i < j->cgroup_count; ++i) |
4238 | 0 | free(j->cgroups[i]); |
4239 | 0 | if (j->seccomp_policy_path) |
4240 | 0 | free(j->seccomp_policy_path); |
4241 | 0 | free(j); |
4242 | 0 | } |
4243 | | |
/* Routes Minijail logging to |fd| for messages at or above |min_priority|. */
void API minijail_log_to_fd(int fd, int min_priority)
{
	init_logging(LOG_TO_FD, fd, min_priority);
}
4248 | | |
4249 | | const char API *minijail_syscall_name(const struct minijail *j, long nr) |
4250 | 0 | { |
4251 | 0 | if (j && j->flags.alt_syscall) |
4252 | 0 | return kAltSyscallNamePlaceholder; |
4253 | 0 | return lookup_syscall_name(nr); |
4254 | 0 | } |