/src/systemd/src/core/namespace.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* SPDX-License-Identifier: LGPL-2.1+ */ |
2 | | |
3 | | #include <errno.h> |
4 | | #include <sched.h> |
5 | | #include <stdio.h> |
6 | | #include <string.h> |
7 | | #include <sys/mount.h> |
8 | | #include <sys/stat.h> |
9 | | #include <unistd.h> |
10 | | #include <linux/fs.h> |
11 | | |
12 | | #include "alloc-util.h" |
13 | | #include "base-filesystem.h" |
14 | | #include "dev-setup.h" |
15 | | #include "fd-util.h" |
16 | | #include "fs-util.h" |
17 | | #include "label.h" |
18 | | #include "loop-util.h" |
19 | | #include "loopback-setup.h" |
20 | | #include "missing.h" |
21 | | #include "mkdir.h" |
22 | | #include "mount-util.h" |
23 | | #include "mountpoint-util.h" |
24 | | #include "namespace-util.h" |
25 | | #include "namespace.h" |
26 | | #include "nulstr-util.h" |
27 | | #include "path-util.h" |
28 | | #include "selinux-util.h" |
29 | | #include "socket-util.h" |
30 | | #include "sort-util.h" |
31 | | #include "stat-util.h" |
32 | | #include "string-table.h" |
33 | | #include "string-util.h" |
34 | | #include "strv.h" |
35 | | #include "umask-util.h" |
36 | | #include "user-util.h" |
37 | | |
/* Mount flags used for the tmpfs instance that mount_private_dev() sets up as the private /dev. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
39 | | |
/* The kind of mount operation a MountEntry describes.
 *
 * The numeric order matters: mount_path_compare() sorts entries for the same path by this value, and
 * drop_duplicates() keeps the first one, so lower values take priority. */
typedef enum MountMode {
        INACCESSIBLE = 0,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
        READWRITE_IMPLICIT,     /* Must stay last among the real modes: lowest priority. */
        _MOUNT_MODE_MAX,
} MountMode;
57 | | |
/* One entry of the mount table assembled for a service's mount namespace.
 *
 * String members come in "_const"/"_malloc" pairs: the mount_entry_path(), mount_entry_source() and
 * mount_entry_options() accessors return the "_malloc" member when it is set and the "_const" member
 * otherwise. Only the "_malloc" members are released by mount_entry_done().
 *
 * The static tables in this file initialize the first three members positionally (path, mode, ignore), so
 * the member order must not be changed. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        bool nosuid:1;            /* Shall set MS_NOSUID on the mount itself */
        bool applied:1;           /* Already applied */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Use this instead of 'source_const' if set */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Use this instead of 'options_const' if set */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
        unsigned n_followed;      /* NOTE(review): presumably the number of symlink steps followed for this entry — confirm against follow_symlink() */
} MountEntry;
74 | | |
75 | | /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted |
76 | | * something there already. These mounts are hence overridden by any other explicitly configured mounts. */ |
77 | | static const MountEntry apivfs_table[] = { |
78 | | { "/proc", PROCFS, false }, |
79 | | { "/dev", BIND_DEV, false }, |
80 | | { "/sys", SYSFS, false }, |
81 | | }; |
82 | | |
83 | | /* ProtectKernelTunables= option and the related filesystem APIs */ |
84 | | static const MountEntry protect_kernel_tunables_table[] = { |
85 | | { "/proc/acpi", READONLY, true }, |
86 | | { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */ |
87 | | { "/proc/asound", READONLY, true }, |
88 | | { "/proc/bus", READONLY, true }, |
89 | | { "/proc/fs", READONLY, true }, |
90 | | { "/proc/irq", READONLY, true }, |
91 | | { "/proc/kallsyms", INACCESSIBLE, true }, |
92 | | { "/proc/kcore", INACCESSIBLE, true }, |
93 | | { "/proc/latency_stats", READONLY, true }, |
94 | | { "/proc/mtrr", READONLY, true }, |
95 | | { "/proc/scsi", READONLY, true }, |
96 | | { "/proc/sys", READONLY, false }, |
97 | | { "/proc/sysrq-trigger", READONLY, true }, |
98 | | { "/proc/timer_stats", READONLY, true }, |
99 | | { "/sys", READONLY, false }, |
100 | | { "/sys/fs/bpf", READONLY, true }, |
101 | | { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */ |
102 | | { "/sys/fs/selinux", READWRITE_IMPLICIT, true }, |
103 | | { "/sys/kernel/debug", READONLY, true }, |
104 | | { "/sys/kernel/tracing", READONLY, true }, |
105 | | }; |
106 | | |
107 | | /* ProtectKernelModules= option */ |
108 | | static const MountEntry protect_kernel_modules_table[] = { |
109 | | #if HAVE_SPLIT_USR |
110 | | { "/lib/modules", INACCESSIBLE, true }, |
111 | | #endif |
112 | | { "/usr/lib/modules", INACCESSIBLE, true }, |
113 | | }; |
114 | | |
115 | | /* |
116 | | * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of |
117 | | * system should be protected by ProtectSystem= |
118 | | */ |
119 | | static const MountEntry protect_home_read_only_table[] = { |
120 | | { "/home", READONLY, true }, |
121 | | { "/run/user", READONLY, true }, |
122 | | { "/root", READONLY, true }, |
123 | | }; |
124 | | |
125 | | /* ProtectHome=tmpfs table */ |
126 | | static const MountEntry protect_home_tmpfs_table[] = { |
127 | | { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME }, |
128 | | { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME }, |
129 | | { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME }, |
130 | | }; |
131 | | |
132 | | /* ProtectHome=yes table */ |
133 | | static const MountEntry protect_home_yes_table[] = { |
134 | | { "/home", INACCESSIBLE, true }, |
135 | | { "/run/user", INACCESSIBLE, true }, |
136 | | { "/root", INACCESSIBLE, true }, |
137 | | }; |
138 | | |
139 | | /* ProtectSystem=yes table */ |
140 | | static const MountEntry protect_system_yes_table[] = { |
141 | | { "/usr", READONLY, false }, |
142 | | { "/boot", READONLY, true }, |
143 | | { "/efi", READONLY, true }, |
144 | | #if HAVE_SPLIT_USR |
145 | | { "/lib", READONLY, true }, |
146 | | { "/lib64", READONLY, true }, |
147 | | { "/bin", READONLY, true }, |
148 | | # if HAVE_SPLIT_BIN |
149 | | { "/sbin", READONLY, true }, |
150 | | # endif |
151 | | #endif |
152 | | }; |
153 | | |
154 | | /* ProtectSystem=full includes ProtectSystem=yes */ |
155 | | static const MountEntry protect_system_full_table[] = { |
156 | | { "/usr", READONLY, false }, |
157 | | { "/boot", READONLY, true }, |
158 | | { "/efi", READONLY, true }, |
159 | | { "/etc", READONLY, false }, |
160 | | #if HAVE_SPLIT_USR |
161 | | { "/lib", READONLY, true }, |
162 | | { "/lib64", READONLY, true }, |
163 | | { "/bin", READONLY, true }, |
164 | | # if HAVE_SPLIT_BIN |
165 | | { "/sbin", READONLY, true }, |
166 | | # endif |
167 | | #endif |
168 | | }; |
169 | | |
170 | | /* |
171 | | * ProtectSystem=strict table. In this strict mode, we mount everything |
172 | | * read-only, except for /proc, /dev, /sys which are the kernel API VFS, |
173 | | * which are left writable, but PrivateDevices= + ProtectKernelTunables= |
174 | | * protect those, and these options should be fully orthogonal. |
175 | | * (And of course /home and friends are also left writable, as ProtectHome= |
176 | | * shall manage those, orthogonally). |
177 | | */ |
178 | | static const MountEntry protect_system_strict_table[] = { |
179 | | { "/", READONLY, false }, |
180 | | { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */ |
181 | | { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */ |
182 | | { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */ |
183 | | { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */ |
184 | | { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */ |
185 | | { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */ |
186 | | }; |
187 | | |
/* Human-readable name for every MountMode, indexed by the enum value. The debug log messages in this file
 * print these names via mount_mode_to_string(). */
static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
        [INACCESSIBLE]         = "inaccessible",
        [BIND_MOUNT]           = "bind",
        [BIND_MOUNT_RECURSIVE] = "rbind",
        [PRIVATE_TMP]          = "private-tmp",
        [PRIVATE_DEV]          = "private-dev",
        [BIND_DEV]             = "bind-dev",
        [EMPTY_DIR]            = "empty",
        [SYSFS]                = "sysfs",
        [PROCFS]               = "procfs",
        [READONLY]             = "read-only",
        [READWRITE]            = "read-write",
        [TMPFS]                = "tmpfs",
        [READWRITE_IMPLICIT]   = "rw-implicit",
};

/* Provides mount_mode_to_string(), the lookup over mount_mode_table used by the log calls in this file. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
205 | | |
206 | 0 | static const char *mount_entry_path(const MountEntry *p) { |
207 | 0 | assert(p); |
208 | 0 |
|
209 | 0 | /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that, |
210 | 0 | * otherwise the stack/static ->path field is returned. */ |
211 | 0 |
|
212 | 0 | return p->path_malloc ?: p->path_const; |
213 | 0 | } |
214 | | |
215 | 0 | static bool mount_entry_read_only(const MountEntry *p) { |
216 | 0 | assert(p); |
217 | 0 |
|
218 | 0 | return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE); |
219 | 0 | } |
220 | | |
221 | 0 | static const char *mount_entry_source(const MountEntry *p) { |
222 | 0 | assert(p); |
223 | 0 |
|
224 | 0 | return p->source_malloc ?: p->source_const; |
225 | 0 | } |
226 | | |
227 | 0 | static const char *mount_entry_options(const MountEntry *p) { |
228 | 0 | assert(p); |
229 | 0 |
|
230 | 0 | return p->options_malloc ?: p->options_const; |
231 | 0 | } |
232 | | |
233 | 0 | static void mount_entry_done(MountEntry *p) { |
234 | 0 | assert(p); |
235 | 0 |
|
236 | 0 | p->path_malloc = mfree(p->path_malloc); |
237 | 0 | p->source_malloc = mfree(p->source_malloc); |
238 | 0 | p->options_malloc = mfree(p->options_malloc); |
239 | 0 | } |
240 | | |
241 | 0 | static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) { |
242 | 0 | char **i; |
243 | 0 |
|
244 | 0 | assert(p); |
245 | 0 |
|
246 | 0 | /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */ |
247 | 0 |
|
248 | 0 | STRV_FOREACH(i, strv) { |
249 | 0 | bool ignore = false, needs_prefix = false; |
250 | 0 | const char *e = *i; |
251 | 0 |
|
252 | 0 | /* Look for any prefixes */ |
253 | 0 | if (startswith(e, "-")) { |
254 | 0 | e++; |
255 | 0 | ignore = true; |
256 | 0 | } |
257 | 0 | if (startswith(e, "+")) { |
258 | 0 | e++; |
259 | 0 | needs_prefix = true; |
260 | 0 | } |
261 | 0 |
|
262 | 0 | if (!path_is_absolute(e)) |
263 | 0 | return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), |
264 | 0 | "Path is not absolute: %s", e); |
265 | 0 | |
266 | 0 | *((*p)++) = (MountEntry) { |
267 | 0 | .path_const = e, |
268 | 0 | .mode = mode, |
269 | 0 | .ignore = ignore, |
270 | 0 | .has_prefix = !needs_prefix && !forcibly_require_prefix, |
271 | 0 | }; |
272 | 0 | } |
273 | 0 |
|
274 | 0 | return 0; |
275 | 0 | } |
276 | | |
277 | 0 | static int append_empty_dir_mounts(MountEntry **p, char **strv) { |
278 | 0 | char **i; |
279 | 0 |
|
280 | 0 | assert(p); |
281 | 0 |
|
282 | 0 | /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the |
283 | 0 | * "/private/" boundary directories for DynamicUser=1. */ |
284 | 0 |
|
285 | 0 | STRV_FOREACH(i, strv) { |
286 | 0 |
|
287 | 0 | *((*p)++) = (MountEntry) { |
288 | 0 | .path_const = *i, |
289 | 0 | .mode = EMPTY_DIR, |
290 | 0 | .ignore = false, |
291 | 0 | .read_only = true, |
292 | 0 | .options_const = "mode=755", |
293 | 0 | .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, |
294 | 0 | }; |
295 | 0 | } |
296 | 0 |
|
297 | 0 | return 0; |
298 | 0 | } |
299 | | |
300 | 0 | static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) { |
301 | 0 | size_t i; |
302 | 0 |
|
303 | 0 | assert(p); |
304 | 0 |
|
305 | 0 | for (i = 0; i < n; i++) { |
306 | 0 | const BindMount *b = binds + i; |
307 | 0 |
|
308 | 0 | *((*p)++) = (MountEntry) { |
309 | 0 | .path_const = b->destination, |
310 | 0 | .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT, |
311 | 0 | .read_only = b->read_only, |
312 | 0 | .nosuid = b->nosuid, |
313 | 0 | .source_const = b->source, |
314 | 0 | .ignore = b->ignore_enoent, |
315 | 0 | }; |
316 | 0 | } |
317 | 0 |
|
318 | 0 | return 0; |
319 | 0 | } |
320 | | |
321 | 0 | static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) { |
322 | 0 | size_t i; |
323 | 0 | int r; |
324 | 0 |
|
325 | 0 | assert(p); |
326 | 0 |
|
327 | 0 | for (i = 0; i < n; i++) { |
328 | 0 | const TemporaryFileSystem *t = tmpfs + i; |
329 | 0 | _cleanup_free_ char *o = NULL, *str = NULL; |
330 | 0 | unsigned long flags; |
331 | 0 | bool ro = false; |
332 | 0 |
|
333 | 0 | if (!path_is_absolute(t->path)) |
334 | 0 | return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), |
335 | 0 | "Path is not absolute: %s", |
336 | 0 | t->path); |
337 | 0 | |
338 | 0 | str = strjoin("mode=0755,", t->options); |
339 | 0 | if (!str) |
340 | 0 | return -ENOMEM; |
341 | 0 | |
342 | 0 | r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o); |
343 | 0 | if (r < 0) |
344 | 0 | return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); |
345 | 0 | |
346 | 0 | ro = flags & MS_RDONLY; |
347 | 0 | if (ro) |
348 | 0 | flags ^= MS_RDONLY; |
349 | 0 |
|
350 | 0 | *((*p)++) = (MountEntry) { |
351 | 0 | .path_const = t->path, |
352 | 0 | .mode = TMPFS, |
353 | 0 | .read_only = ro, |
354 | 0 | .options_malloc = TAKE_PTR(o), |
355 | 0 | .flags = flags, |
356 | 0 | }; |
357 | 0 | } |
358 | 0 |
|
359 | 0 | return 0; |
360 | 0 | } |
361 | | |
362 | 0 | static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) { |
363 | 0 | size_t i; |
364 | 0 |
|
365 | 0 | assert(p); |
366 | 0 | assert(mounts); |
367 | 0 |
|
368 | 0 | /* Adds a list of static pre-defined entries */ |
369 | 0 |
|
370 | 0 | for (i = 0; i < n; i++) |
371 | 0 | *((*p)++) = (MountEntry) { |
372 | 0 | .path_const = mount_entry_path(mounts+i), |
373 | 0 | .mode = mounts[i].mode, |
374 | 0 | .ignore = mounts[i].ignore || ignore_protect, |
375 | 0 | }; |
376 | 0 |
|
377 | 0 | return 0; |
378 | 0 | } |
379 | | |
380 | 0 | static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) { |
381 | 0 | assert(p); |
382 | 0 |
|
383 | 0 | switch (protect_home) { |
384 | 0 |
|
385 | 0 | case PROTECT_HOME_NO: |
386 | 0 | return 0; |
387 | 0 |
|
388 | 0 | case PROTECT_HOME_READ_ONLY: |
389 | 0 | return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect); |
390 | 0 |
|
391 | 0 | case PROTECT_HOME_TMPFS: |
392 | 0 | return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect); |
393 | 0 |
|
394 | 0 | case PROTECT_HOME_YES: |
395 | 0 | return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect); |
396 | 0 |
|
397 | 0 | default: |
398 | 0 | assert_not_reached("Unexpected ProtectHome= value"); |
399 | 0 | } |
400 | 0 | } |
401 | | |
402 | 0 | static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) { |
403 | 0 | assert(p); |
404 | 0 |
|
405 | 0 | switch (protect_system) { |
406 | 0 |
|
407 | 0 | case PROTECT_SYSTEM_NO: |
408 | 0 | return 0; |
409 | 0 |
|
410 | 0 | case PROTECT_SYSTEM_STRICT: |
411 | 0 | return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect); |
412 | 0 |
|
413 | 0 | case PROTECT_SYSTEM_YES: |
414 | 0 | return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect); |
415 | 0 |
|
416 | 0 | case PROTECT_SYSTEM_FULL: |
417 | 0 | return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect); |
418 | 0 |
|
419 | 0 | default: |
420 | 0 | assert_not_reached("Unexpected ProtectSystem= value"); |
421 | 0 | } |
422 | 0 | } |
423 | | |
424 | 0 | static int mount_path_compare(const MountEntry *a, const MountEntry *b) { |
425 | 0 | int d; |
426 | 0 |
|
427 | 0 | /* If the paths are not equal, then order prefixes first */ |
428 | 0 | d = path_compare(mount_entry_path(a), mount_entry_path(b)); |
429 | 0 | if (d != 0) |
430 | 0 | return d; |
431 | 0 | |
432 | 0 | /* If the paths are equal, check the mode */ |
433 | 0 | return CMP((int) a->mode, (int) b->mode); |
434 | 0 | } |
435 | | |
436 | 0 | static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) { |
437 | 0 | size_t i; |
438 | 0 |
|
439 | 0 | /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */ |
440 | 0 |
|
441 | 0 | for (i = 0; i < n; i++) { |
442 | 0 | char *s; |
443 | 0 |
|
444 | 0 | if (m[i].has_prefix) |
445 | 0 | continue; |
446 | 0 | |
447 | 0 | s = prefix_root(root_directory, mount_entry_path(m+i)); |
448 | 0 | if (!s) |
449 | 0 | return -ENOMEM; |
450 | 0 | |
451 | 0 | free_and_replace(m[i].path_malloc, s); |
452 | 0 | m[i].has_prefix = true; |
453 | 0 | } |
454 | 0 |
|
455 | 0 | return 0; |
456 | 0 | } |
457 | | |
458 | 0 | static void drop_duplicates(MountEntry *m, size_t *n) { |
459 | 0 | MountEntry *f, *t, *previous; |
460 | 0 |
|
461 | 0 | assert(m); |
462 | 0 | assert(n); |
463 | 0 |
|
464 | 0 | /* Drops duplicate entries. Expects that the array is properly ordered already. */ |
465 | 0 |
|
466 | 0 | for (f = m, t = m, previous = NULL; f < m + *n; f++) { |
467 | 0 |
|
468 | 0 | /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare() |
469 | 0 | * above. Note that we only drop duplicates that haven't been mounted yet. */ |
470 | 0 | if (previous && |
471 | 0 | path_equal(mount_entry_path(f), mount_entry_path(previous)) && |
472 | 0 | !f->applied && !previous->applied) { |
473 | 0 | log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode)); |
474 | 0 | previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */ |
475 | 0 | mount_entry_done(f); |
476 | 0 | continue; |
477 | 0 | } |
478 | 0 |
|
479 | 0 | *t = *f; |
480 | 0 | previous = t; |
481 | 0 | t++; |
482 | 0 | } |
483 | 0 |
|
484 | 0 | *n = t - m; |
485 | 0 | } |
486 | | |
487 | 0 | static void drop_inaccessible(MountEntry *m, size_t *n) { |
488 | 0 | MountEntry *f, *t; |
489 | 0 | const char *clear = NULL; |
490 | 0 |
|
491 | 0 | assert(m); |
492 | 0 | assert(n); |
493 | 0 |
|
494 | 0 | /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly |
495 | 0 | * ordered already. */ |
496 | 0 |
|
497 | 0 | for (f = m, t = m; f < m + *n; f++) { |
498 | 0 |
|
499 | 0 | /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop |
500 | 0 | * it, as inaccessible paths really should drop the entire subtree. */ |
501 | 0 | if (clear && path_startswith(mount_entry_path(f), clear)) { |
502 | 0 | log_debug("%s is masked by %s.", mount_entry_path(f), clear); |
503 | 0 | mount_entry_done(f); |
504 | 0 | continue; |
505 | 0 | } |
506 | 0 |
|
507 | 0 | clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL; |
508 | 0 |
|
509 | 0 | *t = *f; |
510 | 0 | t++; |
511 | 0 | } |
512 | 0 |
|
513 | 0 | *n = t - m; |
514 | 0 | } |
515 | | |
516 | 0 | static void drop_nop(MountEntry *m, size_t *n) { |
517 | 0 | MountEntry *f, *t; |
518 | 0 |
|
519 | 0 | assert(m); |
520 | 0 | assert(n); |
521 | 0 |
|
522 | 0 | /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the |
523 | 0 | * list is ordered by prefixes. */ |
524 | 0 |
|
525 | 0 | for (f = m, t = m; f < m + *n; f++) { |
526 | 0 |
|
527 | 0 | /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */ |
528 | 0 | if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) { |
529 | 0 | MountEntry *p; |
530 | 0 | bool found = false; |
531 | 0 |
|
532 | 0 | /* Now let's find the first parent of the entry we are looking at. */ |
533 | 0 | for (p = t-1; p >= m; p--) { |
534 | 0 | if (path_startswith(mount_entry_path(f), mount_entry_path(p))) { |
535 | 0 | found = true; |
536 | 0 | break; |
537 | 0 | } |
538 | 0 | } |
539 | 0 |
|
540 | 0 | /* We found it, let's see if it's the same mode, if so, we can drop this entry */ |
541 | 0 | if (found && p->mode == f->mode) { |
542 | 0 | log_debug("%s (%s) is made redundant by %s (%s)", |
543 | 0 | mount_entry_path(f), mount_mode_to_string(f->mode), |
544 | 0 | mount_entry_path(p), mount_mode_to_string(p->mode)); |
545 | 0 | mount_entry_done(f); |
546 | 0 | continue; |
547 | 0 | } |
548 | 0 | } |
549 | 0 |
|
550 | 0 | *t = *f; |
551 | 0 | t++; |
552 | 0 | } |
553 | 0 |
|
554 | 0 | *n = t - m; |
555 | 0 | } |
556 | | |
557 | 0 | static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) { |
558 | 0 | MountEntry *f, *t; |
559 | 0 |
|
560 | 0 | assert(m); |
561 | 0 | assert(n); |
562 | 0 |
|
563 | 0 | /* Nothing to do */ |
564 | 0 | if (!root_directory) |
565 | 0 | return; |
566 | 0 | |
567 | 0 | /* Drops all mounts that are outside of the root directory. */ |
568 | 0 | |
569 | 0 | for (f = m, t = m; f < m + *n; f++) { |
570 | 0 |
|
571 | 0 | if (!path_startswith(mount_entry_path(f), root_directory)) { |
572 | 0 | log_debug("%s is outside of root directory.", mount_entry_path(f)); |
573 | 0 | mount_entry_done(f); |
574 | 0 | continue; |
575 | 0 | } |
576 | 0 |
|
577 | 0 | *t = *f; |
578 | 0 | t++; |
579 | 0 | } |
580 | 0 |
|
581 | 0 | *n = t - m; |
582 | 0 | } |
583 | | |
/* Makes the device node 'd' available at the same path below 'temporary_mount'.
 *
 * While *make_devnode is true, a real device node is created with mknod(). If that fails with EPERM,
 * *make_devnode is set to false (so later calls skip the attempt) and the function falls back to creating
 * a regular file and bind mounting the original node onto it. For nodes below /dev/ a
 * "<temporary_mount>/dev/{char,block}/MAJOR:MINOR" symlink is created in addition; a failure to create
 * that symlink is only logged.
 *
 * Returns 0 on success, -ENXIO if 'd' does not exist, -EINVAL if 'd' is neither a block nor a character
 * device, the result of log_oom() if the symlink path cannot be allocated, or a negative errno-style value
 * for any other failure. */
static int clone_device_node(
                const char *d,
                const char *temporary_mount,
                bool *make_devnode) {

        _cleanup_free_ char *sl = NULL;
        const char *dn, *bn, *t;
        struct stat st;
        int r;

        if (stat(d, &st) < 0) {
                if (errno == ENOENT) {
                        /* A missing source node is reported as -ENXIO so that callers can tell it apart
                         * from other failures. */
                        log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
                        return -ENXIO;
                }

                return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
        }

        if (!S_ISBLK(st.st_mode) &&
            !S_ISCHR(st.st_mode))
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "Device node '%s' to clone is not a device node, ignoring.",
                                       d);

        /* Destination path: the original path appended to the temporary mount directory */
        dn = strjoina(temporary_mount, d);

        /* First, try to create device node properly */
        if (*make_devnode) {
                mac_selinux_create_file_prepare(d, st.st_mode);
                r = mknod(dn, st.st_mode, st.st_rdev);
                mac_selinux_create_file_clear();
                if (r >= 0)
                        goto add_symlink;
                if (errno != EPERM)
                        return log_debug_errno(errno, "mknod failed for %s: %m", d);

                /* This didn't work, let's not try this again for the next iterations. */
                *make_devnode = false;
        }

        /* We're about to fallback to bind-mounting the device
         * node. So create a dummy bind-mount target. */
        mac_selinux_create_file_prepare(d, 0);
        r = mknod(dn, S_IFREG, 0);
        mac_selinux_create_file_clear();
        if (r < 0 && errno != EEXIST)
                return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);

        /* Fallback to bind-mounting:
         * The assumption here is that all used device nodes carry standard
         * properties. Specifically, the devices nodes we bind-mount should
         * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
         * and should not carry ACLs. */
        if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
                return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d);

add_symlink:
        /* 'bn' is the part of the path after "/dev/"; nodes outside of /dev/ get no symlink */
        bn = path_startswith(d, "/dev/");
        if (!bn)
                return 0;

        /* Create symlinks like /dev/char/1:9 → ../urandom */
        if (asprintf(&sl, "%s/dev/%s/%u:%u", temporary_mount, S_ISCHR(st.st_mode) ? "char" : "block", major(st.st_rdev), minor(st.st_rdev)) < 0)
                return log_oom();

        (void) mkdir_parents(sl, 0755);

        t = strjoina("../", bn);

        if (symlink(t, sl) < 0)
                log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);

        return 0;
}
659 | | |
/* Sets up a private /dev at the mount point of entry 'm'.
 *
 * A fresh tmpfs is mounted in a temporary directory below /tmp and populated: /dev/pts and /dev/shm are
 * bind mounted (failure is fatal), /dev/mqueue and /dev/hugepages are bind mounted (failure is only
 * logged), /dev/ptmx is recreated as a symlink or cloned as a device node, /dev/log becomes a symlink to
 * the journal socket, and the nodes listed in 'devnodes' are cloned via clone_device_node(). Finally
 * everything mounted below the target is unmounted and the prepared tmpfs is moved onto it.
 *
 * Returns 0 on success and a negative errno-style value on failure; on failure the temporary mounts and
 * directories are removed again on a best-effort basis. */
static int mount_private_dev(MountEntry *m) {
        /* NUL-separated list of the device nodes to clone into the private /dev */
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        bool can_mknod = true;
        _cleanup_umask_ mode_t u;
        int r;

        assert(m);

        /* Clear the umask so that the modes passed to mkdir()/mknod() below are applied unmodified */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts);
                goto fail;
        }

        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
         * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
         * Thus, in that case make a clone.
         * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
        r = is_symlink("/dev/ptmx");
        if (r < 0) {
                log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
                goto fail;
        } else if (r > 0) {
                devptmx = strjoina(temporary_mount, "/dev/ptmx");
                if (symlink("pts/ptmx", devptmx) < 0) {
                        r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
                        goto fail;
                }
        } else {
                r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
                if (r < 0)
                        goto fail;
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 0755);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm);
                goto fail;
        }

        /* The following mounts and the /dev/log symlink are optional: failures are logged and ignored */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0)
                log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0)
                log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages);

        devlog = strjoina(temporary_mount, "/dev/log");
        if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
                log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);

        NULSTR_FOREACH(d, devnodes) {
                r = clone_device_node(d, temporary_mount, &can_mknod);
                /* -ENXIO means the *source* device node does not exist, skip creation in that case */
                if (r < 0 && r != -ENXIO)
                        goto fail;
        }

        r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
        if (r < 0)
                log_debug_errno(r, "Failed to setup basic device tree at '%s', ignoring: %m", temporary_mount);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(mount_entry_path(m), 0755);

        /* Unmount everything in old /dev */
        r = umount_recursive(mount_entry_path(m), 0);
        if (r < 0)
                log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));

        /* Move the fully prepared tmpfs onto the final mount point */
        if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
                r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m));
                goto fail;
        }

        (void) rmdir(dev);
        (void) rmdir(temporary_mount);

        return 0;

fail:
        /* Best-effort cleanup. A path variable is non-NULL only once execution got as far as preparing
         * that mount, so only those are unmounted. */
        if (devpts)
                (void) umount(devpts);

        if (devshm)
                (void) umount(devshm);

        if (devhugepages)
                (void) umount(devhugepages);

        if (devmqueue)
                (void) umount(devmqueue);

        (void) umount(dev);
        (void) rmdir(dev);
        (void) rmdir(temporary_mount);

        return r;
}
789 | | |
790 | 0 | static int mount_bind_dev(const MountEntry *m) { |
791 | 0 | int r; |
792 | 0 |
|
793 | 0 | assert(m); |
794 | 0 |
|
795 | 0 | /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's |
796 | 0 | * /dev. This is only used when RootDirectory= is set. */ |
797 | 0 |
|
798 | 0 | (void) mkdir_p_label(mount_entry_path(m), 0755); |
799 | 0 |
|
800 | 0 | r = path_is_mount_point(mount_entry_path(m), NULL, 0); |
801 | 0 | if (r < 0) |
802 | 0 | return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m"); |
803 | 0 | if (r > 0) /* make this a NOP if /dev is already a mount point */ |
804 | 0 | return 0; |
805 | 0 | |
806 | 0 | if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) |
807 | 0 | return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m)); |
808 | 0 | |
809 | 0 | return 1; |
810 | 0 | } |
811 | | |
812 | 0 | static int mount_sysfs(const MountEntry *m) { |
813 | 0 | int r; |
814 | 0 |
|
815 | 0 | assert(m); |
816 | 0 |
|
817 | 0 | (void) mkdir_p_label(mount_entry_path(m), 0755); |
818 | 0 |
|
819 | 0 | r = path_is_mount_point(mount_entry_path(m), NULL, 0); |
820 | 0 | if (r < 0) |
821 | 0 | return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m"); |
822 | 0 | if (r > 0) /* make this a NOP if /sys is already a mount point */ |
823 | 0 | return 0; |
824 | 0 | |
825 | 0 | /* Bind mount the host's version so that we get all child mounts of it, too. */ |
826 | 0 | if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) |
827 | 0 | return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); |
828 | 0 | |
829 | 0 | return 1; |
830 | 0 | } |
831 | | |
832 | 0 | static int mount_procfs(const MountEntry *m) { |
833 | 0 | int r; |
834 | 0 |
|
835 | 0 | assert(m); |
836 | 0 |
|
837 | 0 | (void) mkdir_p_label(mount_entry_path(m), 0755); |
838 | 0 |
|
839 | 0 | r = path_is_mount_point(mount_entry_path(m), NULL, 0); |
840 | 0 | if (r < 0) |
841 | 0 | return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m"); |
842 | 0 | if (r > 0) /* make this a NOP if /proc is already a mount point */ |
843 | 0 | return 0; |
844 | 0 | |
845 | 0 | /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */ |
846 | 0 | if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) |
847 | 0 | return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); |
848 | 0 | |
849 | 0 | return 1; |
850 | 0 | } |
851 | | |
852 | 0 | static int mount_tmpfs(const MountEntry *m) { |
853 | 0 | assert(m); |
854 | 0 |
|
855 | 0 | /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */ |
856 | 0 |
|
857 | 0 | (void) mkdir_p_label(mount_entry_path(m), 0755); |
858 | 0 | (void) umount_recursive(mount_entry_path(m), 0); |
859 | 0 |
|
860 | 0 | if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0) |
861 | 0 | return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); |
862 | 0 | |
863 | 0 | return 1; |
864 | 0 | } |
865 | | |
866 | | static int follow_symlink( |
867 | | const char *root_directory, |
868 | 0 | MountEntry *m) { |
869 | 0 |
|
870 | 0 | _cleanup_free_ char *target = NULL; |
871 | 0 | int r; |
872 | 0 |
|
873 | 0 | /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we |
874 | 0 | * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at |
875 | 0 | * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the |
876 | 0 | * end and already have a fully normalized name. */ |
877 | 0 |
|
878 | 0 | r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target); |
879 | 0 | if (r < 0) |
880 | 0 | return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m)); |
881 | 0 | if (r > 0) /* Reached the end, nothing more to resolve */ |
882 | 0 | return 1; |
883 | 0 | |
884 | 0 | if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */ |
885 | 0 | return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), |
886 | 0 | "Symlink loop on '%s'.", |
887 | 0 | mount_entry_path(m)); |
888 | 0 | |
889 | 0 | log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target); |
890 | 0 |
|
891 | 0 | free_and_replace(m->path_malloc, target); |
892 | 0 | m->has_prefix = true; |
893 | 0 |
|
894 | 0 | m->n_followed ++; |
895 | 0 |
|
896 | 0 | return 0; |
897 | 0 | } |
898 | | |
899 | | static int apply_mount( |
900 | | const char *root_directory, |
901 | 0 | MountEntry *m) { |
902 | 0 |
|
903 | 0 | bool rbind = true, make = false; |
904 | 0 | const char *what; |
905 | 0 | int r; |
906 | 0 |
|
907 | 0 | assert(m); |
908 | 0 |
|
909 | 0 | log_debug("Applying namespace mount on %s", mount_entry_path(m)); |
910 | 0 |
|
911 | 0 | switch (m->mode) { |
912 | 0 |
|
913 | 0 | case INACCESSIBLE: { |
914 | 0 | struct stat target; |
915 | 0 |
|
916 | 0 | /* First, get rid of everything that is below if there |
917 | 0 | * is anything... Then, overmount it with an |
918 | 0 | * inaccessible path. */ |
919 | 0 | (void) umount_recursive(mount_entry_path(m), 0); |
920 | 0 |
|
921 | 0 | if (lstat(mount_entry_path(m), &target) < 0) { |
922 | 0 | if (errno == ENOENT && m->ignore) |
923 | 0 | return 0; |
924 | 0 | |
925 | 0 | return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m)); |
926 | 0 | } |
927 | 0 |
|
928 | 0 | what = mode_to_inaccessible_node(target.st_mode); |
929 | 0 | if (!what) |
930 | 0 | return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), |
931 | 0 | "File type not supported for inaccessible mounts. Note that symlinks are not allowed"); |
932 | 0 | break; |
933 | 0 | } |
934 | 0 |
|
935 | 0 | case READONLY: |
936 | 0 | case READWRITE: |
937 | 0 | case READWRITE_IMPLICIT: |
938 | 0 | r = path_is_mount_point(mount_entry_path(m), root_directory, 0); |
939 | 0 | if (r == -ENOENT && m->ignore) |
940 | 0 | return 0; |
941 | 0 | if (r < 0) |
942 | 0 | return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m)); |
943 | 0 | if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */ |
944 | 0 | return 0; |
945 | 0 | /* This isn't a mount point yet, let's make it one. */ |
946 | 0 | what = mount_entry_path(m); |
947 | 0 | break; |
948 | 0 |
|
949 | 0 | case BIND_MOUNT: |
950 | 0 | rbind = false; |
951 | 0 |
|
952 | 0 | _fallthrough_; |
953 | 0 | case BIND_MOUNT_RECURSIVE: { |
954 | 0 | _cleanup_free_ char *chased = NULL; |
955 | 0 |
|
956 | 0 | /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind |
957 | 0 | * mount source paths are always relative to the host root, hence we pass NULL as root directory to |
958 | 0 | * chase_symlinks() here. */ |
959 | 0 |
|
960 | 0 | r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased); |
961 | 0 | if (r == -ENOENT && m->ignore) { |
962 | 0 | log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m)); |
963 | 0 | return 0; |
964 | 0 | } |
965 | 0 | if (r < 0) |
966 | 0 | return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m)); |
967 | 0 | |
968 | 0 | log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased); |
969 | 0 |
|
970 | 0 | free_and_replace(m->source_malloc, chased); |
971 | 0 |
|
972 | 0 | what = mount_entry_source(m); |
973 | 0 | make = true; |
974 | 0 | break; |
975 | 0 | } |
976 | 0 |
|
977 | 0 | case EMPTY_DIR: |
978 | 0 | case TMPFS: |
979 | 0 | return mount_tmpfs(m); |
980 | 0 |
|
981 | 0 | case PRIVATE_TMP: |
982 | 0 | what = mount_entry_source(m); |
983 | 0 | make = true; |
984 | 0 | break; |
985 | 0 |
|
986 | 0 | case PRIVATE_DEV: |
987 | 0 | return mount_private_dev(m); |
988 | 0 |
|
989 | 0 | case BIND_DEV: |
990 | 0 | return mount_bind_dev(m); |
991 | 0 |
|
992 | 0 | case SYSFS: |
993 | 0 | return mount_sysfs(m); |
994 | 0 |
|
995 | 0 | case PROCFS: |
996 | 0 | return mount_procfs(m); |
997 | 0 |
|
998 | 0 | default: |
999 | 0 | assert_not_reached("Unknown mode"); |
1000 | 0 | } |
1001 | 0 |
|
1002 | 0 | assert(what); |
1003 | 0 |
|
1004 | 0 | if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) { |
1005 | 0 | bool try_again = false; |
1006 | 0 | r = -errno; |
1007 | 0 |
|
1008 | 0 | if (r == -ENOENT && make) { |
1009 | 0 | struct stat st; |
1010 | 0 |
|
1011 | 0 | /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */ |
1012 | 0 |
|
1013 | 0 | if (stat(what, &st) < 0) |
1014 | 0 | log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what); |
1015 | 0 | else { |
1016 | 0 | int q; |
1017 | 0 |
|
1018 | 0 | (void) mkdir_parents(mount_entry_path(m), 0755); |
1019 | 0 |
|
1020 | 0 | if (S_ISDIR(st.st_mode)) |
1021 | 0 | q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0; |
1022 | 0 | else |
1023 | 0 | q = touch(mount_entry_path(m)); |
1024 | 0 |
|
1025 | 0 | if (q < 0) |
1026 | 0 | log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m)); |
1027 | 0 | else |
1028 | 0 | try_again = true; |
1029 | 0 | } |
1030 | 0 | } |
1031 | 0 |
|
1032 | 0 | if (try_again) { |
1033 | 0 | if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) |
1034 | 0 | r = -errno; |
1035 | 0 | else |
1036 | 0 | r = 0; |
1037 | 0 | } |
1038 | 0 |
|
1039 | 0 | if (r < 0) |
1040 | 0 | return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); |
1041 | 0 | } |
1042 | 0 | |
1043 | 0 | log_debug("Successfully mounted %s to %s", what, mount_entry_path(m)); |
1044 | 0 | return 0; |
1045 | 0 | } |
1046 | | |
1047 | | /* Change per-mount flags on an existing mount */ |
1048 | 0 | static int bind_remount_one(const char *path, unsigned long orig_flags, unsigned long new_flags, unsigned long flags_mask) { |
1049 | 0 | if (mount(NULL, path, NULL, (orig_flags & ~flags_mask) | MS_REMOUNT | MS_BIND | new_flags, NULL) < 0) |
1050 | 0 | return -errno; |
1051 | 0 | |
1052 | 0 | return 0; |
1053 | 0 | } |
1054 | | |
1055 | 0 | static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) { |
1056 | 0 | unsigned long new_flags = 0, flags_mask = 0; |
1057 | 0 | bool submounts = false; |
1058 | 0 | int r = 0; |
1059 | 0 |
|
1060 | 0 | assert(m); |
1061 | 0 | assert(proc_self_mountinfo); |
1062 | 0 |
|
1063 | 0 | if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) { |
1064 | 0 | new_flags |= MS_RDONLY; |
1065 | 0 | flags_mask |= MS_RDONLY; |
1066 | 0 | } |
1067 | 0 |
|
1068 | 0 | if (m->nosuid) { |
1069 | 0 | new_flags |= MS_NOSUID; |
1070 | 0 | flags_mask |= MS_NOSUID; |
1071 | 0 | } |
1072 | 0 |
|
1073 | 0 | if (flags_mask == 0) /* No Change? */ |
1074 | 0 | return 0; |
1075 | 0 | |
1076 | 0 | /* We generally apply these changes recursively, except for /dev, and the cases we know there's |
1077 | 0 | * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the |
1078 | 0 | * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace |
1079 | 0 | * and running Linux <= 4.17. */ |
1080 | 0 | submounts = |
1081 | 0 | mount_entry_read_only(m) && |
1082 | 0 | !IN_SET(m->mode, EMPTY_DIR, TMPFS); |
1083 | 0 | if (submounts) |
1084 | 0 | r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, blacklist, proc_self_mountinfo); |
1085 | 0 | else |
1086 | 0 | r = bind_remount_one(mount_entry_path(m), m->flags, new_flags, flags_mask); |
1087 | 0 |
|
1088 | 0 | /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked |
1089 | 0 | * read-only already stays this way. This improves compatibility with container managers, where we |
1090 | 0 | * won't attempt to undo read-only mounts already applied. */ |
1091 | 0 |
|
1092 | 0 | if (r == -ENOENT && m->ignore) |
1093 | 0 | return 0; |
1094 | 0 | if (r < 0) |
1095 | 0 | return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m), |
1096 | 0 | submounts ? " and its submounts" : ""); |
1097 | 0 | return 0; |
1098 | 0 | } |
1099 | | |
1100 | 0 | static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { |
1101 | 0 | assert(ns_info); |
1102 | 0 |
|
1103 | 0 | /* |
1104 | 0 | * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, |
1105 | 0 | * since to protect the API VFS mounts, they need to be around in the |
1106 | 0 | * first place... |
1107 | 0 | */ |
1108 | 0 |
|
1109 | 0 | return ns_info->mount_apivfs || |
1110 | 0 | ns_info->protect_control_groups || |
1111 | 0 | ns_info->protect_kernel_tunables; |
1112 | 0 | } |
1113 | | |
1114 | | static size_t namespace_calculate_mounts( |
1115 | | const NamespaceInfo *ns_info, |
1116 | | char** read_write_paths, |
1117 | | char** read_only_paths, |
1118 | | char** inaccessible_paths, |
1119 | | char** empty_directories, |
1120 | | size_t n_bind_mounts, |
1121 | | size_t n_temporary_filesystems, |
1122 | | const char* tmp_dir, |
1123 | | const char* var_tmp_dir, |
1124 | | ProtectHome protect_home, |
1125 | 0 | ProtectSystem protect_system) { |
1126 | 0 |
|
1127 | 0 | size_t protect_home_cnt; |
1128 | 0 | size_t protect_system_cnt = |
1129 | 0 | (protect_system == PROTECT_SYSTEM_STRICT ? |
1130 | 0 | ELEMENTSOF(protect_system_strict_table) : |
1131 | 0 | ((protect_system == PROTECT_SYSTEM_FULL) ? |
1132 | 0 | ELEMENTSOF(protect_system_full_table) : |
1133 | 0 | ((protect_system == PROTECT_SYSTEM_YES) ? |
1134 | 0 | ELEMENTSOF(protect_system_yes_table) : 0))); |
1135 | 0 |
|
1136 | 0 | protect_home_cnt = |
1137 | 0 | (protect_home == PROTECT_HOME_YES ? |
1138 | 0 | ELEMENTSOF(protect_home_yes_table) : |
1139 | 0 | ((protect_home == PROTECT_HOME_READ_ONLY) ? |
1140 | 0 | ELEMENTSOF(protect_home_read_only_table) : |
1141 | 0 | ((protect_home == PROTECT_HOME_TMPFS) ? |
1142 | 0 | ELEMENTSOF(protect_home_tmpfs_table) : 0))); |
1143 | 0 |
|
1144 | 0 | return !!tmp_dir + !!var_tmp_dir + |
1145 | 0 | strv_length(read_write_paths) + |
1146 | 0 | strv_length(read_only_paths) + |
1147 | 0 | strv_length(inaccessible_paths) + |
1148 | 0 | strv_length(empty_directories) + |
1149 | 0 | n_bind_mounts + |
1150 | 0 | n_temporary_filesystems + |
1151 | 0 | ns_info->private_dev + |
1152 | 0 | (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + |
1153 | 0 | (ns_info->protect_control_groups ? 1 : 0) + |
1154 | 0 | (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + |
1155 | 0 | protect_home_cnt + protect_system_cnt + |
1156 | 0 | (ns_info->protect_hostname ? 2 : 0) + |
1157 | 0 | (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0); |
1158 | 0 | } |
1159 | | |
1160 | 0 | static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) { |
1161 | 0 | assert(root_directory); |
1162 | 0 | assert(n_mounts); |
1163 | 0 | assert(mounts || *n_mounts == 0); |
1164 | 0 |
|
1165 | 0 | typesafe_qsort(mounts, *n_mounts, mount_path_compare); |
1166 | 0 |
|
1167 | 0 | drop_duplicates(mounts, n_mounts); |
1168 | 0 | drop_outside_root(root_directory, mounts, n_mounts); |
1169 | 0 | drop_inaccessible(mounts, n_mounts); |
1170 | 0 | drop_nop(mounts, n_mounts); |
1171 | 0 | } |
1172 | | |
/* Sets up the mount namespace for a unit.
 *
 * The function first works out the full list of mount entries from the passed settings, then unshares the
 * mount namespace, prepares the new root (a dissected root_image, the given root_directory, or a bind mount
 * of "/" on /run/systemd/unit-root), applies all mount entries in two rounds (establish mounts, then flip
 * the read-only/nosuid bits), and finally moves the new root into place and applies mount_flags to "/".
 *
 * mount_flags: flags for the final remount of "/"; 0 is replaced by MS_SHARED.
 * error_path:  optional; if non-NULL and a specific path caused the failure, it is set to a newly
 *              allocated copy of that path (the strdup() result is not checked, so it may be NULL).
 *
 * Returns 0 on success and a negative errno-style value on failure. If unshare() fails with EACCES, EPERM,
 * EOPNOTSUPP or ENOSYS, -ENOANO is returned instead, so that the caller can recognize this case. */
int setup_namespace(
                const char* root_directory,
                const char* root_image,
                const NamespaceInfo *ns_info,
                char** read_write_paths,
                char** read_only_paths,
                char** inaccessible_paths,
                char** empty_directories,
                const BindMount *bind_mounts,
                size_t n_bind_mounts,
                const TemporaryFileSystem *temporary_filesystems,
                size_t n_temporary_filesystems,
                const char* tmp_dir,
                const char* var_tmp_dir,
                ProtectHome protect_home,
                ProtectSystem protect_system,
                unsigned long mount_flags,
                DissectImageFlags dissect_image_flags,
                char **error_path) {

        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
        _cleanup_free_ void *root_hash = NULL;
        MountEntry *m = NULL, *mounts = NULL;
        size_t n_mounts, root_hash_size = 0;
        bool require_prefix = false;
        const char *root;
        int r = 0;

        assert(ns_info);

        if (mount_flags == 0)
                mount_flags = MS_SHARED;

        if (root_image) {
                dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;

                /* Only if nothing is configured to be writable the image itself may be opened read-only. */
                if (protect_system == PROTECT_SYSTEM_STRICT &&
                    protect_home != PROTECT_HOME_NO &&
                    strv_isempty(read_write_paths))
                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;

                r = loop_device_make_by_path(root_image,
                                             dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
                                             &loop_device);
                if (r < 0)
                        return log_debug_errno(r, "Failed to create loop device for root image: %m");

                r = root_hash_load(root_image, &root_hash, &root_hash_size);
                if (r < 0)
                        return log_debug_errno(r, "Failed to load root hash: %m");

                r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
                if (r < 0)
                        return log_debug_errno(r, "Failed to dissect image: %m");

                r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
                if (r < 0)
                        return log_debug_errno(r, "Failed to decrypt dissected image: %m");
        }

        if (root_directory)
                root = root_directory;
        else {
                /* Always create the mount namespace in a temporary directory, instead of operating
                 * directly in the root. The temporary directory prevents any mounts from being
                 * potentially obscured by other mounts we already applied.
                 * We use the same mount point for all images, which is safe, since they all live
                 * in their own namespaces after all, and hence won't see each other. */

                root = "/run/systemd/unit-root";
                (void) mkdir_label(root, 0700);
                require_prefix = true;
        }

        n_mounts = namespace_calculate_mounts(
                        ns_info,
                        read_write_paths,
                        read_only_paths,
                        inaccessible_paths,
                        empty_directories,
                        n_bind_mounts,
                        n_temporary_filesystems,
                        tmp_dir, var_tmp_dir,
                        protect_home, protect_system);

        if (n_mounts > 0) {
                /* 'm' is the write cursor into the array, it is advanced by every append below. */
                m = mounts = new0(MountEntry, n_mounts);
                if (!mounts)
                        return -ENOMEM;

                r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_empty_dir_mounts(&m, empty_directories);
                if (r < 0)
                        goto finish;

                r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
                if (r < 0)
                        goto finish;

                r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
                if (r < 0)
                        goto finish;

                if (tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = tmp_dir,
                        };
                }

                if (var_tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/var/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = var_tmp_dir,
                        };
                }

                if (ns_info->private_dev) {
                        *(m++) = (MountEntry) {
                                .path_const = "/dev",
                                .mode = PRIVATE_DEV,
                                .flags = DEV_MOUNT_OPTIONS,
                        };
                }

                if (ns_info->protect_kernel_tunables) {
                        r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_kernel_modules) {
                        r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_control_groups) {
                        *(m++) = (MountEntry) {
                                .path_const = "/sys/fs/cgroup",
                                .mode = READONLY,
                        };
                }

                r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
                if (r < 0)
                        goto finish;

                r = append_protect_system(&m, protect_system, false);
                if (r < 0)
                        goto finish;

                if (namespace_info_mount_apivfs(ns_info)) {
                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_hostname) {
                        *(m++) = (MountEntry) {
                                .path_const = "/proc/sys/kernel/hostname",
                                .mode = READONLY,
                        };
                        *(m++) = (MountEntry) {
                                .path_const = "/proc/sys/kernel/domainname",
                                .mode = READONLY,
                        };
                }

                /* The number of entries appended above must match exactly what
                 * namespace_calculate_mounts() computed, otherwise the two have gone out of sync. */
                assert(mounts + n_mounts == m);

                /* Prepend the root directory where that's necessary */
                r = prefix_where_needed(mounts, n_mounts, root);
                if (r < 0)
                        goto finish;

                normalize_mounts(root, mounts, &n_mounts);
        }

        /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */

        if (unshare(CLONE_NEWNS) < 0) {
                r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
                if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
                        /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place
                         * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable
                         * error back, which the caller can use to detect this case (and only this) and optionally
                         * continue without namespacing applied. */
                        r = -ENOANO;

                goto finish;
        }

        /* Remount / as SLAVE so that nothing now mounted in the namespace
         * shows up in the parent */
        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
                goto finish;
        }

        if (root_image) {
                /* A root image is specified, mount it to the right place */
                r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
                if (r < 0) {
                        log_debug_errno(r, "Failed to mount root image: %m");
                        goto finish;
                }

                if (decrypted_image) {
                        r = decrypted_image_relinquish(decrypted_image);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to relinquish decrypted image: %m");
                                goto finish;
                        }
                }

                loop_device_relinquish(loop_device);

        } else if (root_directory) {

                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
                r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
                if (r < 0) {
                        log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
                        goto finish;
                }
                if (r == 0) {
                        if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                                r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root);
                                goto finish;
                        }
                }

        } else {

                /* Let's mount the main root directory to the root directory to use */
                if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                        r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root);
                        goto finish;
                }
        }

        /* Try to set up the new root directory before mounting anything else there. */
        if (root_image || root_directory)
                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);

        if (n_mounts > 0) {
                _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
                _cleanup_free_ char **blacklist = NULL;
                size_t j;

                /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
                 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
                proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
                if (!proc_self_mountinfo) {
                        r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
                        if (error_path)
                                *error_path = strdup("/proc/self/mountinfo");
                        goto finish;
                }

                /* First round, establish all mounts we need */
                for (;;) {
                        bool again = false;

                        for (m = mounts; m < mounts + n_mounts; ++m) {

                                /* Entries applied in an earlier pass of the outer loop are skipped. */
                                if (m->applied)
                                        continue;

                                r = follow_symlink(root, m);
                                if (r < 0) {
                                        if (error_path && mount_entry_path(m))
                                                *error_path = strdup(mount_entry_path(m));
                                        goto finish;
                                }
                                if (r == 0) {
                                        /* We hit a symlinked mount point. The entry got rewritten and might point to a
                                         * very different place now. Let's normalize the changed list, and start from
                                         * the beginning. After all to mount the entry at the new location we might
                                         * need some other mounts first */
                                        again = true;
                                        break;
                                }

                                r = apply_mount(root, m);
                                if (r < 0) {
                                        if (error_path && mount_entry_path(m))
                                                *error_path = strdup(mount_entry_path(m));
                                        goto finish;
                                }

                                m->applied = true;
                        }

                        if (!again)
                                break;

                        normalize_mounts(root, mounts, &n_mounts);
                }

                /* Create a blacklist we can pass (via make_read_only()) to bind_remount_recursive_with_mountinfo().
                 * The array only borrows the path strings from the mount entries, hence only the array itself
                 * is freed on cleanup. */
                blacklist = new(char*, n_mounts+1);
                if (!blacklist) {
                        r = -ENOMEM;
                        goto finish;
                }
                for (j = 0; j < n_mounts; j++)
                        blacklist[j] = (char*) mount_entry_path(mounts+j);
                blacklist[j] = NULL;

                /* Second round, flip the ro bits if necessary. */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = make_read_only(m, blacklist, proc_self_mountinfo);
                        if (r < 0) {
                                if (error_path && mount_entry_path(m))
                                        *error_path = strdup(mount_entry_path(m));
                                goto finish;
                        }
                }
        }

        /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
        r = mount_move_root(root);
        if (r < 0) {
                log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
                goto finish;
        }

        /* Remount / as the desired mode. Note that this will not
         * reestablish propagation from our side to the host, since
         * what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
                r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
                goto finish;
        }

        r = 0;

finish:
        /* Common exit path for success and failure once the mount entry array may exist: release every
         * entry that is still part of the array, then the array itself. */
        for (m = mounts; m < mounts + n_mounts; m++)
                mount_entry_done(m);

        free(mounts);

        return r;
}
1535 | | |
1536 | 21.2k | void bind_mount_free_many(BindMount *b, size_t n) { |
1537 | 21.2k | size_t i; |
1538 | 21.2k | |
1539 | 21.2k | assert(b || n == 0); |
1540 | 21.2k | |
1541 | 106k | for (i = 0; i < n; i++) { |
1542 | 85.4k | free(b[i].source); |
1543 | 85.4k | free(b[i].destination); |
1544 | 85.4k | } |
1545 | 21.2k | |
1546 | 21.2k | free(b); |
1547 | 21.2k | } |
1548 | | |
1549 | 85.4k | int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) { |
1550 | 85.4k | _cleanup_free_ char *s = NULL, *d = NULL; |
1551 | 85.4k | BindMount *c; |
1552 | 85.4k | |
1553 | 85.4k | assert(b); |
1554 | 85.4k | assert(n); |
1555 | 85.4k | assert(item); |
1556 | 85.4k | |
1557 | 85.4k | s = strdup(item->source); |
1558 | 85.4k | if (!s) |
1559 | 0 | return -ENOMEM; |
1560 | 85.4k | |
1561 | 85.4k | d = strdup(item->destination); |
1562 | 85.4k | if (!d) |
1563 | 0 | return -ENOMEM; |
1564 | 85.4k | |
1565 | 85.4k | c = reallocarray(*b, *n + 1, sizeof(BindMount)); |
1566 | 85.4k | if (!c) |
1567 | 0 | return -ENOMEM; |
1568 | 85.4k | |
1569 | 85.4k | *b = c; |
1570 | 85.4k | |
1571 | 85.4k | c[(*n) ++] = (BindMount) { |
1572 | 85.4k | .source = TAKE_PTR(s), |
1573 | 85.4k | .destination = TAKE_PTR(d), |
1574 | 85.4k | .read_only = item->read_only, |
1575 | 85.4k | .nosuid = item->nosuid, |
1576 | 85.4k | .recursive = item->recursive, |
1577 | 85.4k | .ignore_enoent = item->ignore_enoent, |
1578 | 85.4k | }; |
1579 | 85.4k | |
1580 | 85.4k | return 0; |
1581 | 85.4k | } |
1582 | | |
1583 | 21.1k | void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) { |
1584 | 21.1k | size_t i; |
1585 | 21.1k | |
1586 | 21.1k | assert(t || n == 0); |
1587 | 21.1k | |
1588 | 40.3k | for (i = 0; i < n; i++) { |
1589 | 19.1k | free(t[i].path); |
1590 | 19.1k | free(t[i].options); |
1591 | 19.1k | } |
1592 | 21.1k | |
1593 | 21.1k | free(t); |
1594 | 21.1k | } |
1595 | | |
1596 | | int temporary_filesystem_add( |
1597 | | TemporaryFileSystem **t, |
1598 | | size_t *n, |
1599 | | const char *path, |
1600 | 19.1k | const char *options) { |
1601 | 19.1k | |
1602 | 19.1k | _cleanup_free_ char *p = NULL, *o = NULL; |
1603 | 19.1k | TemporaryFileSystem *c; |
1604 | 19.1k | |
1605 | 19.1k | assert(t); |
1606 | 19.1k | assert(n); |
1607 | 19.1k | assert(path); |
1608 | 19.1k | |
1609 | 19.1k | p = strdup(path); |
1610 | 19.1k | if (!p) |
1611 | 0 | return -ENOMEM; |
1612 | 19.1k | |
1613 | 19.1k | if (!isempty(options)) { |
1614 | 224 | o = strdup(options); |
1615 | 224 | if (!o) |
1616 | 0 | return -ENOMEM; |
1617 | 19.1k | } |
1618 | 19.1k | |
1619 | 19.1k | c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem)); |
1620 | 19.1k | if (!c) |
1621 | 0 | return -ENOMEM; |
1622 | 19.1k | |
1623 | 19.1k | *t = c; |
1624 | 19.1k | |
1625 | 19.1k | c[(*n) ++] = (TemporaryFileSystem) { |
1626 | 19.1k | .path = TAKE_PTR(p), |
1627 | 19.1k | .options = TAKE_PTR(o), |
1628 | 19.1k | }; |
1629 | 19.1k | |
1630 | 19.1k | return 0; |
1631 | 19.1k | } |
1632 | | |
1633 | 0 | static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) { |
1634 | 0 | _cleanup_free_ char *x = NULL; |
1635 | 0 | char bid[SD_ID128_STRING_MAX]; |
1636 | 0 | sd_id128_t boot_id; |
1637 | 0 | int r; |
1638 | 0 |
|
1639 | 0 | assert(id); |
1640 | 0 | assert(prefix); |
1641 | 0 | assert(path); |
1642 | 0 |
|
1643 | 0 | /* We include the boot id in the directory so that after a |
1644 | 0 | * reboot we can easily identify obsolete directories. */ |
1645 | 0 |
|
1646 | 0 | r = sd_id128_get_boot(&boot_id); |
1647 | 0 | if (r < 0) |
1648 | 0 | return r; |
1649 | 0 | |
1650 | 0 | x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX"); |
1651 | 0 | if (!x) |
1652 | 0 | return -ENOMEM; |
1653 | 0 | |
1654 | 0 | RUN_WITH_UMASK(0077) |
1655 | 0 | if (!mkdtemp(x)) |
1656 | 0 | return -errno; |
1657 | 0 |
|
1658 | 0 | RUN_WITH_UMASK(0000) { |
1659 | 0 | char *y; |
1660 | 0 |
|
1661 | 0 | y = strjoina(x, "/tmp"); |
1662 | 0 |
|
1663 | 0 | if (mkdir(y, 0777 | S_ISVTX) < 0) |
1664 | 0 | return -errno; |
1665 | 0 | } |
1666 | 0 |
|
1667 | 0 | *path = TAKE_PTR(x); |
1668 | 0 |
|
1669 | 0 | return 0; |
1670 | 0 | } |
1671 | | |
/* Sets up the pair of private temporary directories for 'id': one below /tmp and one below /var/tmp.
 *
 * On success both output pointers receive newly allocated paths and 0 is returned. If the second
 * directory cannot be created, the first one (and its inner "tmp" subdirectory) is removed again,
 * the error is returned, and neither output pointer is written. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                /* Both directories exist, hand them to the caller. */
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the /tmp directory: inner "tmp" first, then the outer one. Best effort only. */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);
        free(private_tmp);

        return r;
}
1701 | | |
1702 | 0 | int setup_netns(int netns_storage_socket[static 2]) { |
1703 | 0 | _cleanup_close_ int netns = -1; |
1704 | 0 | int r, q; |
1705 | 0 |
|
1706 | 0 | assert(netns_storage_socket); |
1707 | 0 | assert(netns_storage_socket[0] >= 0); |
1708 | 0 | assert(netns_storage_socket[1] >= 0); |
1709 | 0 |
|
1710 | 0 | /* We use the passed socketpair as a storage buffer for our |
1711 | 0 | * namespace reference fd. Whatever process runs this first |
1712 | 0 | * shall create a new namespace, all others should just join |
1713 | 0 | * it. To serialize that we use a file lock on the socket |
1714 | 0 | * pair. |
1715 | 0 | * |
1716 | 0 | * It's a bit crazy, but hey, works great! */ |
1717 | 0 |
|
1718 | 0 | if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) |
1719 | 0 | return -errno; |
1720 | 0 | |
1721 | 0 | netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); |
1722 | 0 | if (netns == -EAGAIN) { |
1723 | 0 | /* Nothing stored yet, so let's create a new namespace. */ |
1724 | 0 |
|
1725 | 0 | if (unshare(CLONE_NEWNET) < 0) { |
1726 | 0 | r = -errno; |
1727 | 0 | goto fail; |
1728 | 0 | } |
1729 | 0 | |
1730 | 0 | (void) loopback_setup(); |
1731 | 0 |
|
1732 | 0 | netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); |
1733 | 0 | if (netns < 0) { |
1734 | 0 | r = -errno; |
1735 | 0 | goto fail; |
1736 | 0 | } |
1737 | 0 | |
1738 | 0 | r = 1; |
1739 | 0 |
|
1740 | 0 | } else if (netns < 0) { |
1741 | 0 | r = netns; |
1742 | 0 | goto fail; |
1743 | 0 |
|
1744 | 0 | } else { |
1745 | 0 | /* Yay, found something, so let's join the namespace */ |
1746 | 0 | if (setns(netns, CLONE_NEWNET) < 0) { |
1747 | 0 | r = -errno; |
1748 | 0 | goto fail; |
1749 | 0 | } |
1750 | 0 | |
1751 | 0 | r = 0; |
1752 | 0 | } |
1753 | 0 |
|
1754 | 0 | q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); |
1755 | 0 | if (q < 0) { |
1756 | 0 | r = q; |
1757 | 0 | goto fail; |
1758 | 0 | } |
1759 | 0 | |
1760 | 0 | fail: |
1761 | 0 | (void) lockf(netns_storage_socket[0], F_ULOCK, 0); |
1762 | 0 | return r; |
1763 | 0 | } |
1764 | | |
1765 | 0 | int open_netns_path(int netns_storage_socket[static 2], const char *path) { |
1766 | 0 | _cleanup_close_ int netns = -1; |
1767 | 0 | int q, r; |
1768 | 0 |
|
1769 | 0 | assert(netns_storage_socket); |
1770 | 0 | assert(netns_storage_socket[0] >= 0); |
1771 | 0 | assert(netns_storage_socket[1] >= 0); |
1772 | 0 | assert(path); |
1773 | 0 |
|
1774 | 0 | /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in |
1775 | 0 | * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a |
1776 | 0 | * new anonymous netns if needed. */ |
1777 | 0 |
|
1778 | 0 | if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) |
1779 | 0 | return -errno; |
1780 | 0 | |
1781 | 0 | netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); |
1782 | 0 | if (netns == -EAGAIN) { |
1783 | 0 | /* Nothing stored yet. Open the file from the file system. */ |
1784 | 0 |
|
1785 | 0 | netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC); |
1786 | 0 | if (netns < 0) { |
1787 | 0 | r = -errno; |
1788 | 0 | goto fail; |
1789 | 0 | } |
1790 | 0 | |
1791 | 0 | r = fd_is_network_ns(netns); |
1792 | 0 | if (r == 0) { /* Not a netns? Refuse early. */ |
1793 | 0 | r = -EINVAL; |
1794 | 0 | goto fail; |
1795 | 0 | } |
1796 | 0 | if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */ |
1797 | 0 | goto fail; |
1798 | 0 | |
1799 | 0 | r = 1; |
1800 | 0 |
|
1801 | 0 | } else if (netns < 0) { |
1802 | 0 | r = netns; |
1803 | 0 | goto fail; |
1804 | 0 | } else |
1805 | 0 | r = 0; /* Already allocated */ |
1806 | 0 |
|
1807 | 0 | q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); |
1808 | 0 | if (q < 0) { |
1809 | 0 | r = q; |
1810 | 0 | goto fail; |
1811 | 0 | } |
1812 | 0 | |
1813 | 0 | fail: |
1814 | 0 | (void) lockf(netns_storage_socket[0], F_ULOCK, 0); |
1815 | 0 | return r; |
1816 | 0 | } |
1817 | | |
1818 | 0 | bool ns_type_supported(NamespaceType type) { |
1819 | 0 | const char *t, *ns_proc; |
1820 | 0 |
|
1821 | 0 | t = namespace_type_to_string(type); |
1822 | 0 | if (!t) /* Don't know how to translate this? Then it's not supported */ |
1823 | 0 | return false; |
1824 | 0 | |
1825 | 0 | ns_proc = strjoina("/proc/self/ns/", t); |
1826 | 0 | return access(ns_proc, F_OK) == 0; |
1827 | 0 | } |
1828 | | |
1829 | | static const char *const protect_home_table[_PROTECT_HOME_MAX] = { |
1830 | | [PROTECT_HOME_NO] = "no", |
1831 | | [PROTECT_HOME_YES] = "yes", |
1832 | | [PROTECT_HOME_READ_ONLY] = "read-only", |
1833 | | [PROTECT_HOME_TMPFS] = "tmpfs", |
1834 | | }; |
1835 | | |
1836 | | DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES); |
1837 | | |
1838 | | static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { |
1839 | | [PROTECT_SYSTEM_NO] = "no", |
1840 | | [PROTECT_SYSTEM_YES] = "yes", |
1841 | | [PROTECT_SYSTEM_FULL] = "full", |
1842 | | [PROTECT_SYSTEM_STRICT] = "strict", |
1843 | | }; |
1844 | | |
1845 | | DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES); |
1846 | | |
1847 | | static const char* const namespace_type_table[] = { |
1848 | | [NAMESPACE_MOUNT] = "mnt", |
1849 | | [NAMESPACE_CGROUP] = "cgroup", |
1850 | | [NAMESPACE_UTS] = "uts", |
1851 | | [NAMESPACE_IPC] = "ipc", |
1852 | | [NAMESPACE_USER] = "user", |
1853 | | [NAMESPACE_PID] = "pid", |
1854 | | [NAMESPACE_NET] = "net", |
1855 | | }; |
1856 | | |
1857 | | DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType); |