1236 1254 8 34 34 202 202 1210 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/tomoyo.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> #include "common.h" /** * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread. * * Returns pointer to "struct tomoyo_domain_info" for current thread. */ struct tomoyo_domain_info *tomoyo_domain(void) { struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } return s->domain_info; } /** * tomoyo_cred_prepare - Target for security_prepare_creds(). * * @new: Pointer to "struct cred". * @old: Pointer to "struct cred". * @gfp: Memory allocation flags. * * Returns 0. */ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { /* Restore old_domain_info saved by previous execve() request. */ struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->domain_info->users); s->domain_info = s->old_domain_info; s->old_domain_info = NULL; } return 0; } /** * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds(). * * @bprm: Pointer to "struct linux_binprm". */ static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm) { /* Clear old_domain_info saved by execve() request. */ struct tomoyo_task *s = tomoyo_task(current); atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER /** * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0. */ static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm) { /* * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested * for the first time. */ if (!tomoyo_policy_loaded) tomoyo_load_policy(bprm->filename); return 0; } #endif /** * tomoyo_bprm_check_security - Target for security_bprm_check(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) { struct tomoyo_task *s = tomoyo_task(current); /* * Execute permission is checked against pathname passed to execve() * using current domain. */ if (!s->old_domain_info) { const int idx = tomoyo_read_lock(); const int err = tomoyo_find_next_domain(bprm); tomoyo_read_unlock(idx); return err; } /* * Read permission is checked against interpreters using next domain. */ return tomoyo_check_open_permission(s->domain_info, &bprm->file->f_path, O_RDONLY); } /** * tomoyo_inode_getattr - Target for security_inode_getattr(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inode_getattr(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL); } /** * tomoyo_path_truncate - Target for security_path_truncate(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_truncate(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL); } /** * tomoyo_file_truncate - Target for security_file_truncate(). * * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_truncate(struct file *file) { return tomoyo_path_truncate(&file->f_path); } /** * tomoyo_path_unlink - Target for security_path_unlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL); } /** * tomoyo_path_mkdir - Target for security_path_mkdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path, mode & S_IALLUGO); } /** * tomoyo_path_rmdir - Target for security_path_rmdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL); } /** * tomoyo_path_symlink - Target for security_path_symlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @old_name: Symlink's content. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry, const char *old_name) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name); } /** * tomoyo_path_mknod - Target for security_path_mknod(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * @dev: Device attributes. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry, umode_t mode, unsigned int dev) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; int type = TOMOYO_TYPE_CREATE; const unsigned int perm = mode & S_IALLUGO; switch (mode & S_IFMT) { case S_IFCHR: type = TOMOYO_TYPE_MKCHAR; break; case S_IFBLK: type = TOMOYO_TYPE_MKBLOCK; break; default: goto no_dev; } return tomoyo_mkdev_perm(type, &path, perm, dev); no_dev: switch (mode & S_IFMT) { case S_IFIFO: type = TOMOYO_TYPE_MKFIFO; break; case S_IFSOCK: type = TOMOYO_TYPE_MKSOCK; break; } return tomoyo_path_number_perm(type, &path, perm); } /** * tomoyo_path_link - Target for security_path_link(). * * @old_dentry: Pointer to "struct dentry". * @new_dir: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry }; return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2); } /** * tomoyo_path_rename - Target for security_path_rename(). * * @old_parent: Pointer to "struct path". * @old_dentry: Pointer to "struct dentry". * @new_parent: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * @flags: Rename options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rename(const struct path *old_parent, struct dentry *old_dentry, const struct path *new_parent, struct dentry *new_dentry, const unsigned int flags) { struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry }; if (flags & RENAME_EXCHANGE) { const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2, &path1); if (err) return err; } return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2); } /** * tomoyo_file_fcntl - Target for security_file_fcntl(). * * @file: Pointer to "struct file". * @cmd: Command for fcntl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { if (!(cmd == F_SETFL && ((arg ^ file->f_flags) & O_APPEND))) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path, O_WRONLY | (arg & O_APPEND)); } /** * tomoyo_file_open - Target for security_file_open(). * * @f: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_open(struct file *f) { /* Don't check read permission here if called from execve(). */ /* Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (f->f_flags & __FMODE_EXEC) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, f->f_flags); } /** * tomoyo_file_ioctl - Target for security_file_ioctl(). * * @file: Pointer to "struct file". * @cmd: Command for ioctl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd); } /** * tomoyo_path_chmod - Target for security_path_chmod(). * * @path: Pointer to "struct path". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chmod(const struct path *path, umode_t mode) { return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path, mode & S_IALLUGO); } /** * tomoyo_path_chown - Target for security_path_chown(). * * @path: Pointer to "struct path". * @uid: Owner ID. * @gid: Group ID. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { int error = 0; if (uid_valid(uid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path, from_kuid(&init_user_ns, uid)); if (!error && gid_valid(gid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path, from_kgid(&init_user_ns, gid)); return error; } /** * tomoyo_path_chroot - Target for security_path_chroot(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chroot(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL); } /** * tomoyo_sb_mount - Target for security_sb_mount(). * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return tomoyo_mount_permission(dev_name, path, type, flags, data); } /** * tomoyo_sb_umount - Target for security_sb_umount(). * * @mnt: Pointer to "struct vfsmount". * @flags: Unmount options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_umount(struct vfsmount *mnt, int flags) { struct path path = { .mnt = mnt, .dentry = mnt->mnt_root }; return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL); } /** * tomoyo_sb_pivotroot - Target for security_sb_pivotroot(). * * @old_path: Pointer to "struct path". * @new_path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path); } /** * tomoyo_socket_listen - Check permission for listen(). * * @sock: Pointer to "struct socket". * @backlog: Backlog parameter. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_listen(struct socket *sock, int backlog) { return tomoyo_socket_listen_permission(sock); } /** * tomoyo_socket_connect - Check permission for connect(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_connect_permission(sock, addr, addr_len); } /** * tomoyo_socket_bind - Check permission for bind(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_bind_permission(sock, addr, addr_len); } /** * tomoyo_socket_sendmsg - Check permission for sendmsg(). * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Size of message. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return tomoyo_socket_sendmsg_permission(sock, msg, size); } struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct tomoyo_task), }; /** * tomoyo_task_alloc - Target for security_task_alloc(). * * @task: Pointer to "struct task_struct". * @clone_flags: clone() flags. * * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); new->domain_info = old->domain_info; atomic_inc(&new->domain_info->users); new->old_domain_info = NULL; return 0; } /** * tomoyo_task_free - Target for security_task_free(). * * @task: Pointer to "struct task_struct". */ static void tomoyo_task_free(struct task_struct *task) { struct tomoyo_task *s = tomoyo_task(task); if (s->domain_info) { atomic_dec(&s->domain_info->users); s->domain_info = NULL; } if (s->old_domain_info) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } } static const struct lsm_id tomoyo_lsmid = { .name = "tomoyo", .id = LSM_ID_TOMOYO, }; /* tomoyo_hooks is used for registering TOMOYO. */ static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc), LSM_HOOK_INIT(task_free, tomoyo_task_free), #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec), #endif LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security), LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl), LSM_HOOK_INIT(file_open, tomoyo_file_open), LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate), LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate), LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink), LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir), LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir), LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink), LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod), LSM_HOOK_INIT(path_link, tomoyo_path_link), LSM_HOOK_INIT(path_rename, tomoyo_path_rename), LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr), LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl), LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod), LSM_HOOK_INIT(path_chown, tomoyo_path_chown), LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot), LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount), LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount), LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot), LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind), LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect), LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen), LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg), }; /* Lock for GC. */ DEFINE_SRCU(tomoyo_ss); int tomoyo_enabled __ro_after_init = 1; /** * tomoyo_init - Register TOMOYO Linux as a LSM module. * * Returns 0. */ static int __init tomoyo_init(void) { struct tomoyo_task *s = tomoyo_task(current); /* register ourselves with the security framework */ security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), &tomoyo_lsmid); pr_info("TOMOYO Linux initialized\n"); s->domain_info = &tomoyo_kernel_domain; atomic_inc(&tomoyo_kernel_domain.users); s->old_domain_info = NULL; tomoyo_mm_init(); return 0; } DEFINE_LSM(tomoyo) = { .name = "tomoyo", .enabled = &tomoyo_enabled, .flags = LSM_FLAG_LEGACY_MAJOR, .blobs = &tomoyo_blob_sizes, .init = tomoyo_init, }; |
274 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/interval_tree.h> #include <linux/interval_tree_generic.h> #include <linux/compiler.h> #include <linux/export.h> #define START(node) ((node)->start) #define LAST(node) ((node)->last) INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long, __subtree_last, START, LAST,, interval_tree) EXPORT_SYMBOL_GPL(interval_tree_insert); EXPORT_SYMBOL_GPL(interval_tree_remove); EXPORT_SYMBOL_GPL(interval_tree_iter_first); EXPORT_SYMBOL_GPL(interval_tree_iter_next); #ifdef CONFIG_INTERVAL_TREE_SPAN_ITER /* * Roll nodes[1] into nodes[0] by advancing nodes[1] to the end of a contiguous * span of nodes. This makes nodes[0]->last the end of that contiguous used span * of indexes that started at the original nodes[1]->start. * * If there is an interior hole, nodes[1] is now the first node starting the * next used span. A hole span is between nodes[0]->last and nodes[1]->start. * * If there is a tailing hole, nodes[1] is now NULL. A hole span is between * nodes[0]->last and last_index. * * If the contiguous used range span to last_index, nodes[1] is set to NULL. */ static void interval_tree_span_iter_next_gap(struct interval_tree_span_iter *state) { struct interval_tree_node *cur = state->nodes[1]; state->nodes[0] = cur; do { if (cur->last > state->nodes[0]->last) state->nodes[0] = cur; cur = interval_tree_iter_next(cur, state->first_index, state->last_index); } while (cur && (state->nodes[0]->last >= cur->start || state->nodes[0]->last + 1 == cur->start)); state->nodes[1] = cur; } void interval_tree_span_iter_first(struct interval_tree_span_iter *iter, struct rb_root_cached *itree, unsigned long first_index, unsigned long last_index) { iter->first_index = first_index; iter->last_index = last_index; iter->nodes[0] = NULL; iter->nodes[1] = interval_tree_iter_first(itree, first_index, last_index); if (!iter->nodes[1]) { /* No nodes intersect the span, whole span is hole */ iter->start_hole = first_index; iter->last_hole = last_index; iter->is_hole = 1; return; } if (iter->nodes[1]->start > first_index) { /* Leading hole on first iteration */ iter->start_hole = first_index; iter->last_hole = iter->nodes[1]->start - 1; iter->is_hole = 1; interval_tree_span_iter_next_gap(iter); return; } /* Starting inside a used */ iter->start_used = first_index; iter->is_hole = 0; interval_tree_span_iter_next_gap(iter); iter->last_used = iter->nodes[0]->last; if (iter->last_used >= last_index) { iter->last_used = last_index; iter->nodes[0] = NULL; iter->nodes[1] = NULL; } } EXPORT_SYMBOL_GPL(interval_tree_span_iter_first); void interval_tree_span_iter_next(struct interval_tree_span_iter *iter) { if (!iter->nodes[0] && !iter->nodes[1]) { iter->is_hole = -1; return; } if (iter->is_hole) { iter->start_used = iter->last_hole + 1; iter->last_used = iter->nodes[0]->last; if (iter->last_used >= iter->last_index) { iter->last_used = iter->last_index; iter->nodes[0] = NULL; iter->nodes[1] = NULL; } iter->is_hole = 0; return; } if (!iter->nodes[1]) { /* Trailing hole */ iter->start_hole = iter->nodes[0]->last + 1; iter->last_hole = iter->last_index; iter->nodes[0] = NULL; iter->is_hole = 1; return; } /* must have both nodes[0] and [1], interior hole */ iter->start_hole = iter->nodes[0]->last + 1; iter->last_hole = iter->nodes[1]->start - 1; iter->is_hole = 1; interval_tree_span_iter_next_gap(iter); } EXPORT_SYMBOL_GPL(interval_tree_span_iter_next); /* * Advance the iterator index to a specific position. The returned used/hole is * updated to start at new_index. This is faster than calling * interval_tree_span_iter_first() as it can avoid full searches in several * cases where the iterator is already set. */ void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, struct rb_root_cached *itree, unsigned long new_index) { if (iter->is_hole == -1) return; iter->first_index = new_index; if (new_index > iter->last_index) { iter->is_hole = -1; return; } /* Rely on the union aliasing hole/used */ if (iter->start_hole <= new_index && new_index <= iter->last_hole) { iter->start_hole = new_index; return; } if (new_index == iter->last_hole + 1) interval_tree_span_iter_next(iter); else interval_tree_span_iter_first(iter, itree, new_index, iter->last_index); } EXPORT_SYMBOL_GPL(interval_tree_span_iter_advance); #endif |
34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM signal #if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SIGNAL_H #include <linux/signal.h> #include <linux/sched.h> #include <linux/tracepoint.h> #define TP_STORE_SIGINFO(__entry, info) \ do { \ if (info == SEND_SIG_NOINFO) { \ __entry->errno = 0; \ __entry->code = SI_USER; \ } else if (info == SEND_SIG_PRIV) { \ __entry->errno = 0; \ __entry->code = SI_KERNEL; \ } else { \ __entry->errno = info->si_errno; \ __entry->code = info->si_code; \ } \ } while (0) #ifndef TRACE_HEADER_MULTI_READ enum { TRACE_SIGNAL_DELIVERED, TRACE_SIGNAL_IGNORED, TRACE_SIGNAL_ALREADY_PENDING, TRACE_SIGNAL_OVERFLOW_FAIL, TRACE_SIGNAL_LOSE_INFO, }; #endif /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct * @group: shared or private * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, * 'info' is not a pointer and you can't access its field. Instead, * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV * means that si_code is SI_KERNEL. */ TRACE_EVENT(signal_generate, TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task, int group, int result), TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, group ) __field( int, result ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; __entry->group = group; __entry->result = result; ), TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, __entry->comm, __entry->pid, __entry->group, __entry->result) ); /** * signal_deliver - called when a signal is delivered * @sig: signal number * @info: pointer to struct siginfo * @ka: pointer to struct k_sigaction * * A 'sig' signal is delivered to current process with 'info' siginfo, * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or * SIG_DFL. * Note that some signals reported by signal_generate tracepoint can be * lost, ignored or modified (by debugger) before hitting this tracepoint. * This means, this can show which signals are actually delivered, but * matching generated signals and delivered signals may not be correct. */ TRACE_EVENT(signal_deliver, TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka), TP_ARGS(sig, info, ka), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __field( unsigned long, sa_handler ) __field( unsigned long, sa_flags ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); __entry->sa_handler = (unsigned long)ka->sa.sa_handler; __entry->sa_flags = ka->sa.sa_flags; ), TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx", __entry->sig, __entry->errno, __entry->code, __entry->sa_handler, __entry->sa_flags) ); #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 | // SPDX-License-Identifier: GPL-2.0 /* Watch queue and general notification mechanism, built on pipes * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/poll.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/sched/signal.h> #include <linux/watch_queue.h> #include <linux/pipe_fs_i.h> MODULE_DESCRIPTION("Watch queue"); MODULE_AUTHOR("Red Hat, Inc."); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) /* * This must be called under the RCU read-lock, which makes * sure that the wqueue still exists. It can then take the lock, * and check that the wqueue hasn't been destroyed, which in * turn makes sure that the notification pipe still exists. */ static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } return true; } static inline void unlock_wqueue(struct watch_queue *wqueue) { spin_unlock_bh(&wqueue->lock); } static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct watch_queue *wqueue = (struct watch_queue *)buf->private; struct page *page; unsigned int bit; /* We need to work out which note within the page this refers to, but * the note might have been maximum size, so merely ANDing the offset * off doesn't work. OTOH, the note must've been more than zero size. */ bit = buf->offset + buf->len; if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) bit -= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; bit += page->private; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing #define watch_queue_pipe_buf_try_steal NULL /* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { .release = watch_queue_pipe_buf_release, .try_steal = watch_queue_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* * Post a notification to a watch queue. * * Must be called with the RCU lock for reading, and the * watch_queue lock held, which guarantees that the pipe * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) { void *p; struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) goto lost; note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; p = kmap_atomic(page); memcpy(p + offset, n, len); kunmap_atomic(p); buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); BUG(); } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; out: spin_unlock_irq(&pipe->rd_wait.lock); if (done) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); return done; lost: buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } /* * Apply filter rules to a notification. */ static bool filter_watch_notification(const struct watch_filter *wf, const struct watch_notification *n) { const struct watch_type_filter *wt; unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; unsigned int st_index = n->subtype / st_bits; unsigned int st_bit = 1U << (n->subtype % st_bits); int i; if (!test_bit(n->type, wf->type_filter)) return false; for (i = 0; i < wf->nr_filters; i++) { wt = &wf->filters[i]; if (n->type == wt->type && (wt->subtype_filter[st_index] & st_bit) && (n->info & wt->info_mask) == wt->info_filter) return true; } return false; /* If there is a filter, the default is to reject. */ } /** * __post_watch_notification - Post an event notification * @wlist: The watch list to post the event to. * @n: The notification record to post. * @cred: The creds of the process that triggered the notification. * @id: The ID to match on the watch. * * Post a notification of an event into a set of watch queues and let the users * know. * * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and * should be in units of sizeof(*n). */ void __post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { const struct watch_filter *wf; struct watch_queue *wqueue; struct watch *watch; if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { WARN_ON(1); return; } rcu_read_lock(); hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { if (watch->id != id) continue; n->info &= ~WATCH_INFO_ID; n->info |= watch->info_id; wqueue = rcu_dereference(watch->queue); wf = rcu_dereference(wqueue->filter); if (wf && !filter_watch_notification(wf, n)) continue; if (security_post_notification(watch->cred, cred, n) < 0) continue; if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); unlock_wqueue(wqueue); } } rcu_read_unlock(); } EXPORT_SYMBOL(__post_watch_notification); /* * Allocate sufficient pages to preallocation for the requested number of * notifications. */ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) { struct watch_queue *wqueue = pipe->watch_queue; struct page **pages; unsigned long *bitmap; unsigned long user_bufs; int ret, i, nr_pages; if (!wqueue) return -ENODEV; if (wqueue->notes) return -EBUSY; if (nr_notes < 1 || nr_notes > 512) /* TODO: choose a better hard limit */ return -EINVAL; nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); if (nr_pages > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto error; } nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; /* * pipe_resize_ring() does not update nr_accounted for watch_queue * pipes, because the above vastly overprovisions. Set nr_accounted on * and max_usage this pipe to the number that was actually charged to * the user above via account_pipe_buffers. */ pipe->max_usage = nr_pages; pipe->nr_accounted = nr_pages; ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto error; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; wqueue->nr_notes = nr_notes; return 0; error_p: while (--i >= 0) __free_page(pages[i]); kfree(pages); error: (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); return ret; } /* * Set the filter on a watch queue. */ long watch_queue_set_filter(struct pipe_inode_info *pipe, struct watch_notification_filter __user *_filter) { struct watch_notification_type_filter *tf; struct watch_notification_filter filter; struct watch_type_filter *q; struct watch_filter *wfilter; struct watch_queue *wqueue = pipe->watch_queue; int ret, nr_filter = 0, i; if (!wqueue) return -ENODEV; if (!_filter) { /* Remove the old filter */ wfilter = NULL; goto set; } /* Grab the user's filter specification */ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) return -EFAULT; if (filter.nr_filters == 0 || filter.nr_filters > 16 || filter.__reserved != 0) return -EINVAL; tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf)); if (IS_ERR(tf)) return PTR_ERR(tf); ret = -EINVAL; for (i = 0; i < filter.nr_filters; i++) { if ((tf[i].info_filter & ~tf[i].info_mask) || tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } /* Now we need to build the internal filter from only the relevant * user-specified filters. */ ret = -ENOMEM; wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); if (!wfilter) goto err_filter; wfilter->nr_filters = nr_filter; q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; q->info_filter = tf[i].info_filter; q->info_mask = tf[i].info_mask; q->subtype_filter[0] = tf[i].subtype_filter[0]; __set_bit(q->type, wfilter->type_filter); q++; } kfree(tf); set: pipe_lock(pipe); wfilter = rcu_replace_pointer(wqueue->filter, wfilter, lockdep_is_held(&pipe->mutex)); pipe_unlock(pipe); if (wfilter) kfree_rcu(wfilter, rcu); return 0; err_filter: kfree(tf); return ret; } static void __put_watch_queue(struct kref *kref) { struct watch_queue *wqueue = container_of(kref, struct watch_queue, usage); struct watch_filter *wfilter; int i; for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); kfree(wqueue->notes); bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) kfree_rcu(wfilter, rcu); kfree_rcu(wqueue, rcu); } /** * put_watch_queue - Dispose of a ref on a watchqueue. * @wqueue: The watch queue to unref. */ void put_watch_queue(struct watch_queue *wqueue) { kref_put(&wqueue->usage, __put_watch_queue); } EXPORT_SYMBOL(put_watch_queue); static void free_watch(struct rcu_head *rcu) { struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); kfree(watch); } static void __put_watch(struct kref *kref) { struct watch *watch = container_of(kref, struct watch, usage); call_rcu(&watch->rcu, free_watch); } /* * Discard a watch. */ static void put_watch(struct watch *watch) { kref_put(&watch->usage, __put_watch); } /** * init_watch - Initialise a watch * @watch: The watch to initialise. * @wqueue: The queue to assign. * * Initialise a watch and set the watch queue. */ void init_watch(struct watch *watch, struct watch_queue *wqueue) { kref_init(&watch->usage); INIT_HLIST_NODE(&watch->list_node); INIT_HLIST_NODE(&watch->queue_node); rcu_assign_pointer(watch->queue, wqueue); } static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) { const struct cred *cred; struct watch *w; hlist_for_each_entry(w, &wlist->watchers, list_node) { struct watch_queue *wq = rcu_access_pointer(w->queue); if (wqueue == wq && watch->id == w->id) return -EBUSY; } cred = current_cred(); if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { atomic_dec(&cred->user->nr_watches); return -EAGAIN; } watch->cred = get_cred(cred); rcu_assign_pointer(watch->watch_list, wlist); kref_get(&wqueue->usage); kref_get(&watch->usage); hlist_add_head(&watch->queue_node, &wqueue->watches); hlist_add_head_rcu(&watch->list_node, &wlist->watchers); return 0; } /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add * @wlist: The watch list to add to * * @watch->queue must have been set to point to the queue to post notifications * to and the watch list of the object to be watched. @watch->cred must also * have been set to the appropriate credentials and a ref taken on them. * * The caller must pin the queue and the list both and must hold the list * locked against racing watch additions/removals. */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { struct watch_queue *wqueue; int ret = -ENOENT; rcu_read_lock(); wqueue = rcu_access_pointer(watch->queue); if (lock_wqueue(wqueue)) { spin_lock(&wlist->lock); ret = add_one_watch(watch, wlist, wqueue); spin_unlock(&wlist->lock); unlock_wqueue(wqueue); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(add_watch_to_object); /** * remove_watch_from_object - Remove a watch or all watches from an object. * @wlist: The watch list to remove from * @wq: The watch queue of interest (ignored if @all is true) * @id: The ID of the watch to remove (ignored if @all is true) * @all: True to remove all objects * * Remove a specific watch or all watches from an object. A notification is * sent to the watcher to tell them that this happened. */ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, u64 id, bool all) { struct watch_notification_removal n; struct watch_queue *wqueue; struct watch *watch; int ret = -EBADSLT; rcu_read_lock(); again: spin_lock(&wlist->lock); hlist_for_each_entry(watch, &wlist->watchers, list_node) { if (all || (watch->id == id && rcu_access_pointer(watch->queue) == wq)) goto found; } spin_unlock(&wlist->lock); goto out; found: ret = 0; hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); spin_unlock(&wlist->lock); /* We now own the reference on watch that used to belong to wlist. */ n.watch.type = WATCH_TYPE_META; n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; n.watch.info = watch->info_id | watch_sizeof(n.watch); n.id = id; if (id != 0) n.watch.info = watch->info_id | watch_sizeof(n); wqueue = rcu_dereference(watch->queue); if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } unlock_wqueue(wqueue); } if (wlist->release_watch) { void (*release_watch)(struct watch *); release_watch = wlist->release_watch; rcu_read_unlock(); (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); if (all && !hlist_empty(&wlist->watchers)) goto again; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(remove_watch_from_object); /* * Remove all the watches that are contributory to a queue. This has the * potential to race with removal of the watches by the destruction of the * objects being watched or with the distribution of notifications. */ void watch_queue_clear(struct watch_queue *wqueue) { struct watch_list *wlist; struct watch *watch; bool release; rcu_read_lock(); spin_lock_bh(&wqueue->lock); /* * This pipe can be freed by callers like free_pipe_info(). * Removing this reference also prevents new notifications. */ wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); hlist_del_init_rcu(&watch->queue_node); /* We now own a ref on the watch. */ spin_unlock_bh(&wqueue->lock); /* We can't do the next bit under the queue lock as we need to * get the list lock - which would cause a deadlock if someone * was removing from the opposite direction at the same time or * posting a notification. */ wlist = rcu_dereference(watch->watch_list); if (wlist) { void (*release_watch)(struct watch *); spin_lock(&wlist->lock); release = !hlist_unhashed(&watch->list_node); if (release) { hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); /* We now own a second ref on the watch. */ } release_watch = wlist->release_watch; spin_unlock(&wlist->lock); if (release) { if (release_watch) { rcu_read_unlock(); /* This might need to call dput(), so * we have to drop all the locks. */ (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); } } put_watch(watch); spin_lock_bh(&wqueue->lock); } spin_unlock_bh(&wqueue->lock); rcu_read_unlock(); } /** * get_watch_queue - Get a watch queue from its file descriptor. * @fd: The fd to query. */ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); CLASS(fd, f)(fd); if (!fd_empty(f)) { pipe = get_pipe_info(fd_file(f), false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } } return wqueue; } EXPORT_SYMBOL(get_watch_queue); /* * Initialise a watch queue */ int watch_queue_init(struct pipe_inode_info *pipe) { struct watch_queue *wqueue; wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); if (!wqueue) return -ENOMEM; wqueue->pipe = pipe; kref_init(&wqueue->usage); spin_lock_init(&wqueue->lock); INIT_HLIST_HEAD(&wqueue->watches); pipe->watch_queue = wqueue; return 0; } |
7 5 2 2 1 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); goto drop; } } if (ip_options_rcv_srr(skb, dev)) goto drop; } return false; drop: return true; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int err, drop_reason; struct rtable *rt; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { err = udp_v4_early_demux(skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); } |
3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dsa/user.c - user device handling * Copyright (c) 2008-2009 Marvell Semiconductor */ #include <linux/list.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> #include <linux/string.h> #include "conduit.h" #include "dsa.h" #include "netlink.h" #include "port.h" #include "switch.h" #include "tag.h" #include "user.h" struct dsa_switchdev_event_work { struct net_device *dev; struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and * SWITCHDEV_FDB_DEL_TO_DEVICE */ unsigned char addr[ETH_ALEN]; u16 vid; bool host_addr; }; enum dsa_standalone_event { DSA_UC_ADD, DSA_UC_DEL, DSA_MC_ADD, DSA_MC_DEL, }; struct dsa_standalone_event_work { struct work_struct work; struct net_device *dev; enum dsa_standalone_event event; unsigned char addr[ETH_ALEN]; u16 vid; }; struct dsa_host_vlan_rx_filtering_ctx { struct net_device *dev; const unsigned char *addr; enum dsa_standalone_event event; }; static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) { return ds->ops->port_mdb_add && ds->ops->port_mdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static void dsa_user_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = container_of(work, struct dsa_standalone_event_work, work); const unsigned char *addr = standalone_work->addr; struct net_device *dev = standalone_work->dev; struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_mdb mdb; struct dsa_switch *ds = dp->ds; u16 vid = standalone_work->vid; int err; switch (standalone_work->event) { case DSA_UC_ADD: err = dsa_port_standalone_host_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_UC_DEL: err = dsa_port_standalone_host_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; case DSA_MC_ADD: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_add(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to mdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_MC_DEL: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_del(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from mdb: %d\n", dp->index, addr, vid, err); } break; } kfree(standalone_work); } static int dsa_user_schedule_standalone_work(struct net_device *dev, enum dsa_standalone_event event, const unsigned char *addr, u16 vid) { struct dsa_standalone_event_work *standalone_work; standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); if (!standalone_work) return -ENOMEM; INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work); standalone_work->event = event; standalone_work->dev = dev; ether_addr_copy(standalone_work->addr, addr); standalone_work->vid = vid; dsa_schedule_work(&standalone_work->work); return 0; } static int dsa_user_host_vlan_rx_filtering(void *arg, int vid) { struct dsa_host_vlan_rx_filtering_ctx *ctx = arg; return dsa_user_schedule_standalone_work(ctx->dev, ctx->event, ctx->addr, vid); } static int dsa_user_vlan_for_each(struct net_device *dev, int (*cb)(void *arg, int vid), void *arg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_vlan *v; int err; lockdep_assert_held(&dev->addr_list_lock); err = cb(arg, 0); if (err) return err; list_for_each_entry(v, &dp->user_vlans, list) { err = cb(arg, v->vid); if (err) return err; } return 0; } static int dsa_user_sync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_ADD, }; dev_uc_add(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_DEL, }; dev_uc_del(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_sync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_ADD, }; dev_mc_add(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_DEL, }; dev_mc_del(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } void dsa_user_sync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_sync_mc(dev, ha->addr); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_sync_uc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } void dsa_user_unsync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_unsync_uc(dev, ha->addr); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_unsync_mc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } /* user mii_bus handling ***************************************************/ static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_read(ds, addr, reg); return 0xffff; } static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_write(ds, addr, reg, val); return 0; } void dsa_user_mii_bus_init(struct dsa_switch *ds) { ds->user_mii_bus->priv = (void *)ds; ds->user_mii_bus->name = "dsa user smi"; ds->user_mii_bus->read = dsa_user_phy_read; ds->user_mii_bus->write = dsa_user_phy_write; snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", ds->dst->index, ds->index); ds->user_mii_bus->parent = ds->dev; ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask; } /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err; if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, addr, 0); if (err) goto out; } if (!ether_addr_equal(addr, conduit->dev_addr)) { err = dev_uc_add(conduit, addr); if (err < 0) goto del_host_addr; } return 0; del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, addr, 0); out: return err; } void dsa_user_host_uc_uninstall(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); } static int dsa_user_open(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); int err; err = dev_open(conduit, NULL); if (err < 0) { netdev_err(dev, "failed to open conduit %s\n", conduit->name); goto out; } err = dsa_user_host_uc_install(dev, dev->dev_addr); if (err) goto out; err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto out_del_host_uc; return 0; out_del_host_uc: dsa_user_host_uc_uninstall(dev); out: return err; } static int dsa_user_close(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); dsa_port_disable_rt(dp); dsa_user_host_uc_uninstall(dev); return 0; } static void dsa_user_manage_host_flood(struct net_device *dev) { bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); struct dsa_port *dp = dsa_user_to_port(dev); bool uc = dev->flags & IFF_PROMISC; dsa_port_set_host_flood(dp, uc, mc); } static void dsa_user_change_rx_flags(struct net_device *dev, int change) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (change & IFF_ALLMULTI) dev_set_allmulti(conduit, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(conduit, dev->flags & IFF_PROMISC ? 1 : -1); if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) dsa_user_manage_host_flood(dev); } static void dsa_user_set_rx_mode(struct net_device *dev) { __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc); __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc); } static int dsa_user_set_mac_address(struct net_device *dev, void *a) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (ds->ops->port_set_mac_address) { err = ds->ops->port_set_mac_address(ds, dp->index, addr->sa_data); if (err) return err; } /* If the port is down, the address isn't synced yet to hardware or * to the DSA conduit, so there is nothing to change. */ if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; err = dsa_user_host_uc_install(dev, addr->sa_data); if (err) return err; dsa_user_host_uc_uninstall(dev); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; } struct dsa_user_dump_ctx { struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; static int dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dump->dev->ifindex; ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); skip: dump->idx++; return 0; nla_put_failure: nlmsg_cancel(dump->skb, nlh); return -EMSGSIZE; } static int dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; int err; err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump); *idx = dump.idx; return err; } static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; int port = p->dp->index; /* Pass through to switch driver if it supports timestamping */ switch (cmd) { case SIOCGHWTSTAMP: if (ds->ops->port_hwtstamp_get) return ds->ops->port_hwtstamp_get(ds, port, ifr); break; case SIOCSHWTSTAMP: if (ds->ops->port_hwtstamp_set) return ds->ops->port_hwtstamp_set(ds, port, ifr); break; } return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int ret; if (ctx && ctx != dp) return 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_PORT_MST_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_MST: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_mst_enable(dp, attr->u.mst, extack); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_VLAN_MSTI: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti); break; default: ret = -EOPNOTSUPP; break; } return ret; } /* Must be called under rcu_read_lock() */ static int dsa_user_vlan_check_for_8021q_uppers(struct net_device *user, const struct switchdev_obj_port_vlan *vlan) { struct net_device *upper_dev; struct list_head *iter; netdev_for_each_upper_dev_rcu(user, upper_dev, iter) { u16 vid; if (!is_vlan_dev(upper_dev)) continue; vid = vlan_dev_vlan_id(upper_dev); if (vid == vlan->vid) return -EBUSY; } return 0; } static int dsa_user_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, "Port already has a VLAN upper with this VID"); return err; } } return dsa_port_vlan_add(dp, vlan, extack); } /* Offload a VLAN installed on the bridge or on a foreign interface by * installing it as a VLAN towards the CPU port. */ static int dsa_user_host_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_add(dev, obj, extack); else err = dsa_user_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static int dsa_user_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_vlan_del(dp, vlan); } static int dsa_user_host_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_host_vlan_del(dp, vlan); } static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_del(dev, obj); else err = dsa_user_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); return netpoll_send_skb(p->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) return; ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) { /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) return dsa_user_netpoll_send_skb(dev, skb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ skb->dev = dsa_user_to_conduit(dev); dev_queue_xmit(skb); return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct sk_buff *nskb; dev_sw_netstats_tx_add(dev, 1, skb->len); memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (skb_ensure_writable_head_tail(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from * skb_ensure_writable_head_tail(), which has also ensured that * padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ nskb = p->xmit(skb, dev); if (!nskb) { kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } /* ethtool operations *******************************************************/ static void dsa_user_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_user_get_regs_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } static void dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) ds->ops->get_regs(ds, dp->index, regs, _p); } static int dsa_user_nway_reset(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_nway_reset(dp->pl); } static int dsa_user_get_eeprom_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->ops->get_eeprom_len) return ds->ops->get_eeprom_len(ds); return 0; } static int dsa_user_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static int dsa_user_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static void dsa_user_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { ethtool_puts(&data, "tx_packets"); ethtool_puts(&data, "tx_bytes"); ethtool_puts(&data, "rx_packets"); ethtool_puts(&data, "rx_bytes"); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } } static void dsa_user_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_user_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count = 0; if (ds->ops->get_sset_count) { count = ds->ops->get_sset_count(ds, dp->index, sset); if (count < 0) return count; } return count + 4; } else if (sset == ETH_SS_TEST) { return net_selftest_get_count(); } return -EOPNOTSUPP; } static void dsa_user_get_eth_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_phy_stats) ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); } static void dsa_user_get_eth_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_mac_stats) ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); } static void dsa_user_get_eth_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_ctrl_stats) ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } static void dsa_user_get_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_rmon_stats) ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } static void dsa_user_get_ts_stats(struct net_device *dev, struct ethtool_ts_stats *ts_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_ts_stats) ds->ops->get_ts_stats(ds, dp->index, ts_stats); } static void dsa_user_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { struct dsa_port *dp = dsa_user_to_port(ndev); struct dsa_switch *ds = dp->ds; if (ds->ops->self_test) { ds->ops->self_test(ds, dp->index, etest, buf); return; } net_selftest(ndev, etest, buf); } static int dsa_user_get_mm(struct net_device *dev, struct ethtool_mm_state *state) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_mm) return -EOPNOTSUPP; return ds->ops->get_mm(ds, dp->index, state); } static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_mm) return -EOPNOTSUPP; return ds->ops->set_mm(ds, dp->index, cfg, extack); } static void dsa_user_get_mm_stats(struct net_device *dev, struct ethtool_mm_stats *stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_mm_stats) ds->ops->get_mm_stats(ds, dp->index, stats); } static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; phylink_ethtool_get_wol(dp->pl, w); if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; phylink_ethtool_set_wol(dp->pl, w); if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* If the port is using phylink managed EEE, then an unimplemented * set_mac_eee() is permissible. */ if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) { /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } else if (ds->ops->set_mac_eee) { ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; return phylink_ethtool_get_eee(dp->pl, e); } static int dsa_user_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_get(dp->pl, cmd); } static int dsa_user_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_set(dp->pl, cmd); } static void dsa_user_get_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_pause_stats) ds->ops->get_pause_stats(ds, dp->index, pause_stats); } static void dsa_user_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); phylink_ethtool_get_pauseparam(dp->pl, pause); } static int dsa_user_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_set_pauseparam(dp->pl, pause); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_user_netpoll_setup(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); if (!netpoll) return -ENOMEM; err = __netpoll_setup(netpoll, conduit); if (err) { kfree(netpoll); goto out; } p->netpoll = netpoll; out: return err; } static void dsa_user_netpoll_cleanup(struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll = p->netpoll; if (!netpoll) return; p->netpoll = NULL; __netpoll_free(netpoll); } static void dsa_user_poll_controller(struct net_device *dev) { } #endif static struct dsa_mall_tc_entry * dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) if (mall_tc_entry->cookie == cookie) return mall_tc_entry; return NULL; } static int dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress, bool ingress_target) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; int err; if (cls->common.protocol != htons(ETH_P_ALL)) { NL_SET_ERR_MSG_MOD(extack, "Can only offload \"protocol all\" matchall filter"); return -EOPNOTSUPP; } if (!ds->ops->port_mirror_add) { NL_SET_ERR_MSG_MOD(extack, "Switch does not support mirroring operation"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; if (!act->dev) return -EINVAL; if (dsa_user_dev_check(act->dev)) { if (ingress_target) { /* We can only fulfill this using software assist */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to ingress of DSA user port if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } else { to_dp = dsa_user_to_port(act->dev); } } else { /* Handle mirroring to foreign target ports as a mirror towards * the CPU. The software tc rule will take the packets from * there. */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to CPU if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } if (dp->ds != to_dp->ds) { NL_SET_ERR_MSG_MOD(extack, "Cross-chip mirroring not implemented"); return -EOPNOTSUPP; } mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; mirror->to_local_port = to_dp->index; mirror->ingress = ingress; err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall_police(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, "Only supported on ingress qdisc"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, "Only one port policer allowed"); return -EEXIST; } } act = &cls->rule->action.entries[0]; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; policer->rate_bytes_per_sec = act->police.rate_bytes_ps; policer->burst = act->police.burst; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { const struct flow_action *action = &cls->rule->action; struct netlink_ext_ack *extack = cls->common.extack; if (!flow_offload_has_one_action(action)) { NL_SET_ERR_MSG_MOD(extack, "Cannot offload matchall filter with more than one action"); return -EOPNOTSUPP; } switch (action->entries[0].id) { case FLOW_ACTION_MIRRED: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, false); case FLOW_ACTION_MIRRED_INGRESS: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, true); case FLOW_ACTION_POLICE: return dsa_user_add_cls_matchall_police(dev, cls, ingress); default: NL_SET_ERR_MSG_MOD(extack, "Unknown action"); break; } return -EOPNOTSUPP; } static void dsa_user_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; list_del(&mall_tc_entry->list); switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: if (ds->ops->port_mirror_del) ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; case DSA_PORT_MALL_POLICER: if (ds->ops->port_policer_del) ds->ops->port_policer_del(ds, dp->index); break; default: WARN_ON(1); } kfree(mall_tc_entry); } static int dsa_user_setup_tc_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: return dsa_user_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_user_del_cls_matchall(dev, cls); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_add_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_add) return -EOPNOTSUPP; return ds->ops->cls_flower_add(ds, port, cls, ingress); } static int dsa_user_del_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_del) return -EOPNOTSUPP; return ds->ops->cls_flower_del(ds, port, cls, ingress); } static int dsa_user_stats_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_stats) return -EOPNOTSUPP; return ds->ops->cls_flower_stats(ds, port, cls, ingress); } static int dsa_user_setup_tc_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { switch (cls->command) { case FLOW_CLS_REPLACE: return dsa_user_add_cls_flower(dev, cls, ingress); case FLOW_CLS_DESTROY: return dsa_user_del_cls_flower(dev, cls, ingress); case FLOW_CLS_STATS: return dsa_user_stats_cls_flower(dev, cls, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv, bool ingress) { struct net_device *dev = cb_priv; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress); case TC_SETUP_CLSFLOWER: return dsa_user_setup_tc_cls_flower(dev, type_data, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true); } static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false); } static LIST_HEAD(dsa_user_block_cb_list); static int dsa_user_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_user_setup_tc_block_cb_ig; else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_user_setup_tc_block_cb_eg; else return -EOPNOTSUPP; f->driver_block_list = &dsa_user_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port)); if (!conduit->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data); } static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; switch (type) { case TC_SETUP_BLOCK: return dsa_user_setup_tc_block(dev, type_data); case TC_SETUP_FT: return dsa_user_setup_ft_block(ds, dp->index, type_data); default: break; } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static int dsa_user_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_user_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; return ds->ops->set_rxnfc(ds, dp->index, nfc); } static int dsa_user_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *ts) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; if (!ds->ops->get_ts_info) return -EOPNOTSUPP; return ds->ops->get_ts_info(ds, p->dp->index, ts); } static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct netlink_ext_ack extack = {0}; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int ret; /* User port... */ ret = dsa_port_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return ret; } /* And CPU port... */ ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, extack._msg); return ret; } if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; v = kzalloc(sizeof(*v), GFP_KERNEL); if (!v) { ret = -ENOMEM; goto rollback; } netif_addr_lock_bh(dev); v->vid = vid; list_add_tail(&v->list, &dp->user_vlans); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_ADD, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_ADD, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; rollback: dsa_port_host_vlan_del(dp, &vlan); dsa_port_vlan_del(dp, &vlan); return ret; } static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int err; err = dsa_port_vlan_del(dp, &vlan); if (err) return err; err = dsa_port_host_vlan_del(dp, &vlan); if (err) return err; if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; netif_addr_lock_bh(dev); v = dsa_vlan_find(&dp->user_vlans, &vlan); if (!v) { netif_addr_unlock_bh(dev); return -ENOENT; } list_del(&v->list); kfree(v); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_DEL, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_DEL, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; } static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_add_vid(arg, proto, vid); } static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_kill_vid(arg, proto, vid); } /* Keep the VLAN RX filtering list in sync with the hardware only if VLAN * filtering is enabled. The baseline is that only ports that offload a * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, * but there are exceptions for quirky hardware. * * If ds->vlan_filtering_is_global = true, then standalone ports which share * the same switch with other ports that offload a VLAN-aware bridge are also * inevitably VLAN-aware. * * To summarize, a DSA switch port offloads: * * - If standalone (this includes software bridge, software LAG): * - if ds->needs_standalone_vlan_filtering = true, OR if * (ds->vlan_filtering_is_global = true AND there are bridges spanning * this switch chip which have vlan_filtering=1) * - the 8021q upper VLANs * - else (standalone VLAN filtering is not needed, VLAN filtering is not * global, or it is, but no port is under a VLAN-aware bridge): * - no VLAN (any 8021q upper is a software VLAN) * * - If under a vlan_filtering=0 bridge which it offload: * - if ds->configure_vlan_while_not_filtering = true (default): * - the bridge VLANs. These VLANs are committed to hardware but inactive. * - else (deprecated): * - no VLAN. The bridge VLANs are not restored when VLAN awareness is * enabled, so this behavior is broken and discouraged. * * - If under a vlan_filtering=1 bridge which it offload: * - the bridge VLANs * - the 8021q upper VLANs */ int dsa_user_manage_vlan_filtering(struct net_device *user, bool vlan_filtering) { int err; if (vlan_filtering) { user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; err = vlan_for_each(user, dsa_user_restore_vlan, user); if (err) { vlan_for_each(user, dsa_user_clear_vlan, user); user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; return err; } } else { err = vlan_for_each(user, dsa_user_clear_vlan, user); if (err) return err; user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } return 0; } struct dsa_hw_port { struct list_head list; struct net_device *dev; int old_mtu; }; static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) { const struct dsa_hw_port *p; int err; list_for_each_entry(p, hw_port_list, list) { if (p->dev->mtu == mtu) continue; err = dev_set_mtu(p->dev, mtu); if (err) goto rollback; } return 0; rollback: list_for_each_entry_continue_reverse(p, hw_port_list, list) { if (p->dev->mtu == p->old_mtu) continue; if (dev_set_mtu(p->dev, p->old_mtu)) netdev_err(p->dev, "Failed to restore MTU\n"); } return err; } static void dsa_hw_port_list_free(struct list_head *hw_port_list) { struct dsa_hw_port *p, *n; list_for_each_entry_safe(p, n, hw_port_list, list) kfree(p); } /* Make the hardware datapath to/from @dev limited to a common MTU */ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; int min_mtu = ETH_MAX_MTU; struct dsa_port *other_dp; int err; if (!dp->ds->mtu_enforcement_ingress) return; if (!dp->bridge) return; INIT_LIST_HEAD(&hw_port_list); /* Populate the list of ports that are part of the same bridge * as the newly added/modified port */ list_for_each_entry(dst, &dsa_tree_list, list) { list_for_each_entry(other_dp, &dst->ports, list) { struct dsa_hw_port *hw_port; struct net_device *user; if (other_dp->type != DSA_PORT_TYPE_USER) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; if (!other_dp->ds->mtu_enforcement_ingress) continue; user = other_dp->user; if (min_mtu > user->mtu) min_mtu = user->mtu; hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); if (!hw_port) goto out; hw_port->dev = user; hw_port->old_mtu = user->mtu; list_add(&hw_port->list, &hw_port_list); } } /* Attempt to configure the entire hardware bridge to the newly added * interface's MTU first, regardless of whether the intention of the * user was to raise or lower it. */ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu); if (!err) goto out; /* Clearly that didn't work out so well, so just set the minimum MTU on * all hardware bridge ports now. If this fails too, then all ports will * still have their old MTU rolled back anyway. */ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); out: dsa_hw_port_list_free(&hw_port_list); } int dsa_user_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_port *cpu_dp = dp->cpu_dp; struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int largest_mtu = 0; int new_conduit_mtu; int old_conduit_mtu; int mtu_limit; int overhead; int cpu_mtu; int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; dsa_tree_for_each_user_port(other_dp, ds->dst) { int user_mtu; /* During probe, this function will be called for each user * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ if (!other_dp->user) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ if (dp == other_dp) user_mtu = new_mtu; else user_mtu = other_dp->user->mtu; if (largest_mtu < user_mtu) largest_mtu = user_mtu; } overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead); old_conduit_mtu = conduit->mtu; new_conduit_mtu = largest_mtu + overhead; if (new_conduit_mtu > mtu_limit) return -ERANGE; /* If the conduit MTU isn't over limit, there's no need to check the CPU * MTU, since that surely isn't either. */ cpu_mtu = largest_mtu; /* Start applying stuff */ if (new_conduit_mtu != old_conduit_mtu) { err = dev_set_mtu(conduit, new_conduit_mtu); if (err < 0) goto out_conduit_failed; /* We only need to propagate the MTU of the CPU port to * upstream switches, so emit a notifier which updates them. */ err = dsa_port_mtu_change(cpu_dp, cpu_mtu); if (err) goto out_cpu_failed; } err = ds->ops->port_change_mtu(ds, dp->index, new_mtu); if (err) goto out_port_failed; WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); return 0; out_port_failed: if (new_conduit_mtu != old_conduit_mtu) dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead); out_cpu_failed: if (new_conduit_mtu != old_conduit_mtu) dev_set_mtu(conduit, old_conduit_mtu); out_conduit_failed: return err; } static int __maybe_unused dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_set_apptrust) return -EOPNOTSUPP; return ds->ops->port_set_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_get_apptrust) return -EOPNOTSUPP; return ds->ops->port_get_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } /* Update the DSCP prio entries on all user ports of the switch in case * the switch supports global DSCP prio instead of per port DSCP prios. */ static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, struct dcb_app *app, bool del) { int (*setdel)(struct net_device *dev, struct dcb_app *app); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int err, restore_err; if (del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; err = setdel(user, app); if (err) goto err_try_to_restore; } return 0; err_try_to_restore: /* Revert logic to restore previous state of app entries */ if (!del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; restore_err = setdel(user, app); if (restore_err) netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); } return err; } static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_add_dscp_prio) return -EOPNOTSUPP; if (dscp >= 64) { netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n", dscp); return -EINVAL; } err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); if (err) { if (ds->ops->port_del_dscp_prio) ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_set_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_add_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } static int __maybe_unused dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = mask ? __fls(mask) : 0; err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_del_dscp_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority); if (err) { dcb_ieee_setapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); if (err) { if (ds->ops->port_add_dscp_prio) ds->ops->port_add_dscp_prio(ds, port, dscp, app->priority); dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_del_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_del_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } /* Pre-populate the DCB application priority table with the priorities * configured during switch setup, which we read from hardware here. */ static int dsa_user_dcbnl_init(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; int err; if (ds->ops->port_get_default_prio) { int prio = ds->ops->port_get_default_prio(ds, port); struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, .protocol = 0, .priority = prio, }; if (prio < 0) return prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } if (ds->ops->port_get_dscp_prio) { int protocol; for (protocol = 0; protocol < 64; protocol++) { struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_DSCP, .protocol = protocol, }; int prio; prio = ds->ops->port_get_dscp_prio(ds, port, protocol); if (prio == -EOPNOTSUPP) continue; if (prio < 0) return prio; app.priority = prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } } return 0; } static const struct ethtool_ops dsa_user_ethtool_ops = { .get_drvinfo = dsa_user_get_drvinfo, .get_regs_len = dsa_user_get_regs_len, .get_regs = dsa_user_get_regs, .nway_reset = dsa_user_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_user_get_eeprom_len, .get_eeprom = dsa_user_get_eeprom, .set_eeprom = dsa_user_set_eeprom, .get_strings = dsa_user_get_strings, .get_ethtool_stats = dsa_user_get_ethtool_stats, .get_sset_count = dsa_user_get_sset_count, .get_eth_phy_stats = dsa_user_get_eth_phy_stats, .get_eth_mac_stats = dsa_user_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats, .get_rmon_stats = dsa_user_get_rmon_stats, .get_ts_stats = dsa_user_get_ts_stats, .set_wol = dsa_user_set_wol, .get_wol = dsa_user_get_wol, .set_eee = dsa_user_set_eee, .get_eee = dsa_user_get_eee, .get_link_ksettings = dsa_user_get_link_ksettings, .set_link_ksettings = dsa_user_set_link_ksettings, .get_pause_stats = dsa_user_get_pause_stats, .get_pauseparam = dsa_user_get_pauseparam, .set_pauseparam = dsa_user_set_pauseparam, .get_rxnfc = dsa_user_get_rxnfc, .set_rxnfc = dsa_user_set_rxnfc, .get_ts_info = dsa_user_get_ts_info, .self_test = dsa_user_net_selftest, .get_mm = dsa_user_get_mm, .set_mm = dsa_user_set_mm, .get_mm_stats = dsa_user_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_stats64) ds->ops->get_stats64(ds, dp->index, s); else dev_get_tstats64(dev, s); } static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_user_to_port(ctx->dev); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; ctx->dev = conduit; return 0; } static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, .ndo_start_xmit = dsa_user_xmit, .ndo_change_rx_flags = dsa_user_change_rx_flags, .ndo_set_rx_mode = dsa_user_set_rx_mode, .ndo_set_mac_address = dsa_user_set_mac_address, .ndo_fdb_dump = dsa_user_fdb_dump, .ndo_eth_ioctl = dsa_user_ioctl, .ndo_get_iflink = dsa_user_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_user_netpoll_setup, .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup, .ndo_poll_controller = dsa_user_poll_controller, #endif .ndo_setup_tc = dsa_user_setup_tc, .ndo_get_stats64 = dsa_user_get_stats64, .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, }; static const struct device_type dsa_type = { .name = "dsa", }; void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); if (dp->pl) phylink_mac_change(dp->pl, up); } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would * not be called if it was not. */ ds->ops->phylink_fixed_state(ds, dp->index, state); } /* user device setup *******************************************************/ static int dsa_user_phy_connect(struct net_device *user_dev, int addr, u32 flags) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_switch *ds = dp->ds; user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr); if (!user_dev->phydev) { netdev_err(user_dev, "no phy at %d\n", addr); return -ENODEV; } user_dev->phydev->dev_flags |= flags; return phylink_connect_phy(dp->pl, user_dev->phydev); } static int dsa_user_phy_setup(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; u32 phy_flags = 0; int ret; dp->pl_config.dev = &user_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; /* The get_fixed_state callback takes precedence over polling the * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set * this if the switch provides such a callback. */ if (ds->ops->phylink_fixed_state) { dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state; dp->pl_config.poll_fixed_state = true; } ret = dsa_port_phylink_create(dp); if (ret) return ret; if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); if (ret == -ENODEV && ds->user_mii_bus) { /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags); } if (ret) { netdev_err(user_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); dsa_port_phylink_destroy(dp); } return ret; } void dsa_user_setup_tagger(struct net_device *user) { struct dsa_port *dp = dsa_user_to_port(user); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_user_priv *p = netdev_priv(user); const struct dsa_port *cpu_dp = dp->cpu_dp; const struct dsa_switch *ds = dp->ds; user->needed_headroom = cpu_dp->tag_ops->needed_headroom; user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the conduit) * by also inheriting the conduit's needed headroom and tailroom. * The 8021q driver also does this. */ user->needed_headroom += conduit->needed_headroom; user->needed_tailroom += conduit->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; user->lltx = true; } int dsa_user_suspend(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_detach(user_dev); rtnl_lock(); phylink_stop(dp->pl); rtnl_unlock(); return 0; } int dsa_user_resume(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_attach(user_dev); rtnl_lock(); phylink_start(dp->pl); rtnl_unlock(); return 0; } int dsa_user_create(struct dsa_port *port) { struct net_device *conduit = dsa_port_to_conduit(port); struct dsa_switch *ds = port->ds; struct net_device *user_dev; struct dsa_user_priv *p; const char *name; int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; if (port->name) { name = port->name; assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_ENUM; } user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name, assign_type, ether_setup, ds->num_tx_queues, 1); if (user_dev == NULL) return -ENOMEM; user_dev->rtnl_link_ops = &dsa_link_ops; user_dev->ethtool_ops = &dsa_user_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) user_dev->dcbnl_ops = &dsa_user_dcbnl_ops; #endif if (!is_zero_ether_addr(port->mac)) eth_hw_addr_set(user_dev, port->mac); else eth_hw_addr_inherit(user_dev, conduit); user_dev->priv_flags |= IFF_NO_QUEUE; if (dsa_switch_supports_uc_filtering(ds)) user_dev->priv_flags |= IFF_UNICAST_FLT; user_dev->netdev_ops = &dsa_user_netdev_ops; if (ds->ops->port_max_mtu) user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); SET_NETDEV_DEVTYPE(user_dev, &dsa_type); SET_NETDEV_DEV(user_dev, port->ds->dev); SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port); user_dev->dev.of_node = port->dn; user_dev->vlan_features = conduit->vlan_features; p = netdev_priv(user_dev); user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ret = gro_cells_init(&p->gcells, user_dev); if (ret) goto out_free; p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); port->user = user_dev; dsa_user_setup_tagger(user_dev); netif_carrier_off(user_dev); ret = dsa_user_phy_setup(user_dev); if (ret) { netdev_err(user_dev, "error %d setting up PHY for tree %d, switch %d, port %d\n", ret, ds->dst->index, ds->index, port->index); goto out_gcells; } rtnl_lock(); ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN); if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", ret, ETH_DATA_LEN, port->index); ret = register_netdevice(user_dev); if (ret) { netdev_err(conduit, "error %d registering interface %s\n", ret, user_dev->name); rtnl_unlock(); goto out_phy; } if (IS_ENABLED(CONFIG_DCB)) { ret = dsa_user_dcbnl_init(user_dev); if (ret) { netdev_err(user_dev, "failed to initialize DCB: %pe\n", ERR_PTR(ret)); rtnl_unlock(); goto out_unregister; } } ret = netdev_upper_dev_link(conduit, user_dev, NULL); rtnl_unlock(); if (ret) goto out_unregister; return 0; out_unregister: unregister_netdev(user_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: free_netdev(user_dev); port->user = NULL; return ret; } void dsa_user_destroy(struct net_device *user_dev) { struct net_device *conduit = dsa_user_to_conduit(user_dev); struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_user_priv *p = netdev_priv(user_dev); netif_carrier_off(user_dev); rtnl_lock(); netdev_upper_dev_unlink(conduit, user_dev); unregister_netdevice(user_dev); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_netdev(user_dev); } int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, struct netlink_ext_ack *extack) { struct net_device *old_conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct net_device *upper; struct list_head *iter; int err; if (conduit == old_conduit) return 0; if (!ds->ops->port_change_conduit) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support changing DSA conduit"); return -EOPNOTSUPP; } if (!netdev_uses_dsa(conduit)) { NL_SET_ERR_MSG_MOD(extack, "Interface not eligible as DSA conduit"); return -EOPNOTSUPP; } netdev_for_each_upper_dev_rcu(conduit, upper, iter) { if (dsa_user_dev_check(upper)) continue; if (netif_is_bridge_master(upper)) continue; NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers"); return -EOPNOTSUPP; } /* Since we allow live-changing the DSA conduit, plus we auto-open the * DSA conduit when the user port opens => we need to ensure that the * new DSA conduit is open too. */ if (dev->flags & IFF_UP) { err = dev_open(conduit, extack); if (err) return err; } netdev_upper_dev_unlink(old_conduit, dev); err = netdev_upper_dev_link(conduit, dev, extack); if (err) goto out_revert_old_conduit_unlink; err = dsa_port_change_conduit(dp, conduit, extack); if (err) goto out_revert_conduit_link; /* Update the MTU of the new CPU port through cross-chip notifiers */ err = dsa_user_change_mtu(dev, dev->mtu); if (err && err != -EOPNOTSUPP) { netdev_warn(dev, "nonfatal error updating MTU with new conduit: %pe\n", ERR_PTR(err)); } return 0; out_revert_conduit_link: netdev_upper_dev_unlink(conduit, dev); out_revert_old_conduit_unlink: netdev_upper_dev_link(old_conduit, dev, NULL); return err; } bool dsa_user_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_user_netdev_ops; } EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (is_hsr_master(info->upper_dev)) { if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_hsr_leave(dp, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; dp = dsa_user_to_port(dev); if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot * meaningfully placed under a bridge yet */ return NOTIFY_DONE; } static int dsa_user_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_changeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } /* Same as dsa_user_lag_changeupper() except that it calls * dsa_user_prechangeupper() */ static int dsa_user_lag_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_prechangeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *user, *br; struct dsa_port *dp; ext_ack = netdev_notifier_info_to_extack(&info->info); if (!is_vlan_dev(dev)) return NOTIFY_DONE; user = vlan_dev_real_dev(dev); if (!dsa_user_dev_check(user)) return NOTIFY_DONE; dp = dsa_user_to_port(user); br = dsa_port_bridge_dev_get(dp); if (!br) return NOTIFY_DONE; /* Deny enslaving a VLAN device into a VLAN-aware bridge */ if (br_vlan_enabled(br) && netif_is_bridge_master(info->upper_dev) && info->linking) { NL_SET_ERR_MSG_MOD(ext_ack, "Cannot make VLAN device join VLAN-aware bridge"); return notifier_from_errno(-EINVAL); } return NOTIFY_DONE; } static int dsa_user_check_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_user_to_port(dev); struct net_device *br = dsa_port_bridge_dev_get(dp); struct bridge_vlan_info br_info; struct netlink_ext_ack *extack; int err = NOTIFY_DONE; u16 vid; if (!br || !br_vlan_enabled(br)) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); vid = vlan_dev_vlan_id(info->upper_dev); /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { NL_SET_ERR_MSG_MOD(extack, "This VLAN is already configured by the bridge"); return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } static int dsa_user_prechangeupper_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_switch *ds; struct dsa_port *dp; int err; if (!dsa_user_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, info); dp = dsa_user_to_port(dev); ds = dp->ds; if (ds->ops->port_prechangeupper) { err = ds->ops->port_prechangeupper(ds, dp->index, info); if (err) return notifier_from_errno(err); } if (is_vlan_dev(info->upper_dev)) return dsa_user_check_8021q_upper(dev, info); return NOTIFY_DONE; } /* To be eligible as a DSA conduit, a LAG must have all lower interfaces be * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of * switches in the same switch tree. */ static int dsa_lag_conduit_validate(struct net_device *lag_dev, struct netlink_ext_ack *extack) { struct net_device *lower1, *lower2; struct list_head *iter1, *iter2; netdev_for_each_lower_dev(lag_dev, lower1, iter1) { netdev_for_each_lower_dev(lag_dev, lower2, iter2) { if (!netdev_uses_dsa(lower1) || !netdev_uses_dsa(lower2)) { NL_SET_ERR_MSG_MOD(extack, "All LAG ports must be eligible as DSA conduits"); return notifier_from_errno(-EINVAL); } if (lower1 == lower2) continue; if (!dsa_port_tree_same(lower1->dsa_ptr, lower2->dsa_ptr)) { NL_SET_ERR_MSG_MOD(extack, "LAG contains DSA conduits of disjoint switch trees"); return notifier_from_errno(-EINVAL); } } } return NOTIFY_DONE; } static int dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(conduit)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; /* Allow DSA switch uppers */ if (dsa_user_dev_check(info->upper_dev)) return NOTIFY_DONE; /* Allow bridge uppers of DSA conduits, subject to further * restrictions in dsa_bridge_prechangelower_sanity_check() */ if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; /* Allow LAG uppers, subject to further restrictions in * dsa_lag_conduit_prechangelower_sanity_check() */ if (netif_is_lag_master(info->upper_dev)) return dsa_lag_conduit_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA conduit cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } static int dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); struct net_device *lag_dev = info->upper_dev; struct net_device *lower; struct list_head *iter; if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; if (!netdev_uses_dsa(dev)) { NL_SET_ERR_MSG(extack, "Only DSA conduits can join a LAG DSA conduit"); return notifier_from_errno(-EINVAL); } netdev_for_each_lower_dev(lag_dev, lower, iter) { if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { NL_SET_ERR_MSG(extack, "Interface is DSA conduit for a different switch tree than this LAG"); return notifier_from_errno(-EINVAL); } break; } return NOTIFY_DONE; } /* Don't allow bridging of DSA conduits, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these * frames). * The only case where that would not be an issue is when bridging can already * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev * port, and is bridged only with other ports from the same hardware device. */ static int dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, struct netdev_notifier_changeupper_info *info) { struct net_device *br = info->upper_dev; struct netlink_ext_ack *extack; struct net_device *lower; struct list_head *iter; if (!netif_is_bridge_master(br)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); netdev_for_each_lower_dev(br, lower, iter) { if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) continue; if (!netdev_port_same_parent_id(lower, new_lower)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA conduit"); return notifier_from_errno(-EINVAL); } } return NOTIFY_DONE; } static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst, struct net_device *lag_dev) { struct net_device *new_conduit = dsa_tree_find_first_conduit(dst); struct dsa_port *dp; int err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, new_conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", new_conduit->name, ERR_PTR(err)); } } } static int dsa_conduit_lag_join(struct net_device *conduit, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; int err; err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack); if (err) return err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != conduit) continue; err = dsa_user_change_conduit(dp->user, lag_dev, extack); if (err) goto restore; } return 0; restore: dsa_tree_for_each_user_port_continue_reverse(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", conduit->name, ERR_PTR(err)); } } dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); return err; } static void dsa_conduit_lag_leave(struct net_device *conduit, struct net_device *lag_dev) { struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *new_cpu_dp = NULL; struct net_device *lower; struct list_head *iter; netdev_for_each_lower_dev(lag_dev, lower, iter) { if (netdev_uses_dsa(lower)) { new_cpu_dp = lower->dsa_ptr; break; } } if (new_cpu_dp) { /* Update the CPU port of the user ports still under the LAG * so that dsa_port_to_conduit() continues to work properly */ dsa_tree_for_each_user_port(dp, dst) if (dsa_port_to_conduit(dp) == lag_dev) dp->cpu_dp = new_cpu_dp; /* Update the index of the virtual CPU port to match the lowest * physical CPU port */ lag_dev->dsa_ptr = new_cpu_dp; wmb(); } else { /* If the LAG DSA conduit has no ports left, migrate back all * user ports to the first physical CPU port */ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev); } /* This DSA conduit has left its LAG in any case, so let * the CPU port leave the hardware LAG as well */ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); } static int dsa_conduit_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!netdev_uses_dsa(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_conduit_lag_join(dev, info->upper_dev, info->upper_info, extack); err = notifier_from_errno(err); } else { dsa_conduit_lag_leave(dev, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; int err; err = dsa_user_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_conduit_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_lag_conduit_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_user_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGEUPPER: { int err; err = dsa_user_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_conduit_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; int err = 0; if (dsa_user_dev_check(dev)) { dp = dsa_user_to_port(dev); err = dsa_port_lag_change(dp, info->lower_state_info); } /* Mirror LAG port events on DSA conduits that are in * a LAG towards their respective switch CPU ports */ if (netdev_uses_dsa(dev)) { dp = dev->dsa_ptr; err = dsa_port_lag_change(dp, info->lower_state_info); } return notifier_from_errno(err); } case NETDEV_CHANGE: case NETDEV_UP: { /* Track state of conduit port. * DSA driver may require the conduit port (and indirectly * the tagger) to be available for some special operation. */ if (netdev_uses_dsa(dev)) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->ds->dst; /* Track when the conduit port is UP */ dsa_tree_conduit_oper_state_change(dst, dev, netif_oper_up(dev)); /* Track when the conduit port is ready and can accept * packet. * NETDEV_UP event is not enough to flag a port as ready. * We also have to wait for linkwatch_do_dev to dev_activate * and emit a NETDEV_CHANGE event. * We check if a conduit port is ready by checking if the dev * have a qdisc assigned and is not noop. */ dsa_tree_conduit_admin_state_change(dst, dev, !qdisc_tx_is_noop(dev)); return NOTIFY_OK; } return NOTIFY_DONE; } case NETDEV_GOING_DOWN: { struct dsa_port *dp, *cpu_dp; struct dsa_switch_tree *dst; LIST_HEAD(close_list); if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; cpu_dp = dev->dsa_ptr; dst = cpu_dp->ds->dst; dsa_tree_conduit_admin_state_change(dst, dev, false); list_for_each_entry(dp, &dst->ports, list) { if (!dsa_port_is_user(dp)) continue; if (dp->cpu_dp != cpu_dp) continue; list_add(&dp->user->close_list, &close_list); } dev_close_many(&close_list, true); return NOTIFY_OK; } default: break; } return NOTIFY_DONE; } static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, switchdev_work->orig_dev, &info.info, NULL); } static void dsa_user_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); const unsigned char *addr = switchdev_work->addr; struct net_device *dev = switchdev_work->dev; u16 vid = switchdev_work->vid; struct dsa_switch *ds; struct dsa_port *dp; int err; dp = dsa_user_to_port(dev); ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_add(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_add(dp, addr, vid); else err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_del(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_del(dp, addr, vid); else err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; } kfree(switchdev_work); } static bool dsa_foreign_dev_check(const struct net_device *dev, const struct net_device *foreign_dev) { const struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch_tree *dst = dp->ds->dst; if (netif_is_bridge_master(foreign_dev)) return !dsa_tree_offloads_bridge_dev(dst, foreign_dev); if (netif_is_bridge_port(foreign_dev)) return !dsa_tree_offloads_bridge_port(dst, foreign_dev); /* Everything else is foreign */ return true; } static int dsa_user_fdb_event(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info) { struct dsa_switchdev_event_work *switchdev_work; struct dsa_port *dp = dsa_user_to_port(dev); bool host_addr = fdb_info->is_local; struct dsa_switch *ds = dp->ds; if (ctx && ctx != dp) return 0; if (!dp->bridge) return 0; if (switchdev_fdb_is_dynamically_learned(fdb_info)) { if (dsa_port_offloads_bridge_port(dp, orig_dev)) return 0; /* FDB entries learned by the software bridge or by foreign * bridge ports should be installed as host addresses only if * the driver requests assisted learning. */ if (!ds->assisted_learning_on_cpu_port) return 0; } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. */ if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; /* Check early that we're not doing work in vain. * Host addresses on LAG ports still require regular FDB ops, * since the CPU port isn't in a LAG. */ if (dp->lag && !host_addr) { if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) return -EOPNOTSUPP; } else { if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) return -EOPNOTSUPP; } switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", orig_dev->name, fdb_info->addr, fdb_info->vid, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work); switchdev_work->event = event; switchdev_work->dev = dev; switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; dsa_schedule_work(&switchdev_work->work); return 0; } /* Called under rcu_read_lock() */ static int dsa_user_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; } return NOTIFY_OK; } static int dsa_user_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_OBJ_ADD: err = switchdev_handle_port_obj_add_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: err = switchdev_handle_port_obj_del_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); } return NOTIFY_DONE; } static struct notifier_block dsa_user_nb __read_mostly = { .notifier_call = dsa_user_netdevice_event, }; struct notifier_block dsa_user_switchdev_notifier = { .notifier_call = dsa_user_switchdev_event, }; struct notifier_block dsa_user_switchdev_blocking_notifier = { .notifier_call = dsa_user_switchdev_blocking_event, }; int dsa_user_register_notifier(void) { struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_user_nb); if (err) return err; err = register_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) goto err_switchdev_nb; nb = &dsa_user_switchdev_blocking_notifier; err = register_switchdev_blocking_notifier(nb); if (err) goto err_switchdev_blocking_nb; return 0; err_switchdev_blocking_nb: unregister_switchdev_notifier(&dsa_user_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_user_nb); return err; } void dsa_user_unregister_notifier(void) { struct notifier_block *nb; int err; nb = &dsa_user_switchdev_blocking_notifier; err = unregister_switchdev_blocking_notifier(nb); if (err) pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); err = unregister_netdevice_notifier(&dsa_user_nb); if (err) pr_err("DSA: failed to unregister user notifier (%d)\n", err); } |
168 168 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fcntl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> #include <linux/uaccess.h> #include "internal.h" #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. */ if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ if (O_NONBLOCK != O_NDELAY) if (arg & O_NDELAY) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT) && !(filp->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); if (error) return error; /* * ->fasync() is responsible for setting the FASYNC bit. */ if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); if (error < 0) goto out; if (error > 0) error = 0; } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: return error; } /* * Allocate an file->f_owner struct if it doesn't exist, handling racing * allocations correctly. */ int file_f_owner_allocate(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) return 0; f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); if (!f_owner) return -ENOMEM; rwlock_init(&f_owner->lock); f_owner->file = file; /* If someone else raced us, drop our allocation. */ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) kfree(f_owner); return 0; } EXPORT_SYMBOL(file_f_owner_allocate); void file_f_owner_release(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) { put_pid(f_owner->pid); kfree(f_owner); } } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (WARN_ON_ONCE(!f_owner)) return; write_lock_irq(&f_owner->lock); if (force || !f_owner->pid) { put_pid(f_owner->pid); f_owner->pid = get_pid(pid); f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); security_file_set_fowner(filp); f_owner->uid = cred->uid; f_owner->euid = cred->euid; } } write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; int ret = 0; might_sleep(); type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ if (who == INT_MIN) return -EINVAL; type = PIDTYPE_PGID; who = -who; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); if (who) { pid = find_vpid(who); if (!pid) ret = -ESRCH; } if (!ret) __f_setown(filp, pid, type, force); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (!f_owner) return pid; read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) { pid = pid_vnr(f_owner->pid); if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); read_unlock_irq(&f_owner->lock); return pid; } static int f_setown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; struct pid *pid; int type; int ret; ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) return -EFAULT; switch (owner.type) { case F_OWNER_TID: type = PIDTYPE_PID; break; case F_OWNER_PID: type = PIDTYPE_TGID; break; case F_OWNER_PGRP: type = PIDTYPE_PGID; break; default: return -EINVAL; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) ret = -ESRCH; else __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; } static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; struct fown_struct *f_owner; enum pid_type pid_type = PIDTYPE_PID; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) owner.pid = pid_vnr(f_owner->pid); rcu_read_unlock(); pid_type = f_owner->pid_type; } switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; case PIDTYPE_TGID: owner.type = F_OWNER_PID; break; case PIDTYPE_PGID: owner.type = F_OWNER_PGRP; break; default: WARN_ON(1); ret = -EINVAL; break; } if (f_owner) read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); if (ret) ret = -EFAULT; } return ret; } #ifdef CONFIG_CHECKPOINT_RESTORE static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; uid_t src[2] = {0, 0}; int err; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); src[0] = from_kuid(user_ns, f_owner->uid); src[1] = from_kuid(user_ns, f_owner->euid); read_unlock_irq(&f_owner->lock); } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); return err; } #else static int f_getowner_uids(struct file *filp, unsigned long arg) { return -EINVAL; } #endif static bool rw_hint_valid(u64 hint) { BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: case RWH_WRITE_LIFE_LONG: case RWH_WRITE_LIFE_EXTREME: return true; default: return false; } } static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint = READ_ONCE(inode->i_write_hint); if (copy_to_user(argp, &hint, sizeof(*argp))) return -EFAULT; return 0; } static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; if (copy_from_user(&hint, argp, sizeof(hint))) return -EFAULT; if (!rw_hint_valid(hint)) return -EINVAL; WRITE_ONCE(inode->i_write_hint, hint); /* * file->f_mapping->host may differ from inode. As an example, * blkdev_open() modifies file->f_mapping. */ if (file->f_mapping->host != inode) WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); return 0; } /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. * * Technically we didn't need a ref at all, and 'fdget()' was * overkill, but given our lockless file pointer lookup, the * alternatives are complicated. */ return fd_file(f) == filp; } /* Let the caller figure out whether a given file was just created. */ static long f_created_query(const struct file *filp) { return !!(filp->f_mode & FMODE_CREATED); } static int f_owner_sig(struct file *filp, int signum, bool setsig) { int ret = 0; struct fown_struct *f_owner; might_sleep(); if (setsig) { if (!valid_signal(signum)) return -EINVAL; ret = file_f_owner_allocate(filp); if (ret) return ret; } f_owner = file_f_owner(filp); if (setsig) f_owner->signum = signum; else if (f_owner) ret = f_owner->signum; return ret; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_CREATED_QUERY: err = f_created_query(filp); break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_DUPFD_QUERY: err = f_dupfd_query(argi, filp); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_GETLK: #endif case F_GETLK: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_getlk(filp, cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_SETLK: case F_OFD_SETLKW: fallthrough; #endif case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* * XXX If f_owner is a process group, the * negative return value will get converted * into an error. Oops. If we keep the * current syscall conventions, the only way * to fix this will be in libc. */ err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); break; case F_SETOWN_EX: err = f_setown_ex(filp, arg); break; case F_GETOWNER_UIDS: err = f_getowner_uids(filp, arg); break; case F_GETSIG: err = f_owner_sig(filp, 0, false); break; case F_SETSIG: err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: err = fcntl_get_rw_hint(filp, cmd, arg); break; case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; } return err; } static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: case F_GETFD: case F_SETFD: case F_GETFL: return 1; } return 0; } SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd_raw, f)(fd); long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); return err; } #if BITS_PER_LONG == 32 SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; CLASS(fd_raw, f)(fd); struct flock64 flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK64: case F_OFD_GETLK: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_getlk64(fd_file(f), cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_setlk64(fd, fd_file(f), cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } #endif #ifdef CONFIG_COMPAT /* careful - don't use anywhere else */ #define copy_flock_fields(dst, src) \ (dst)->l_type = (src)->l_type; \ (dst)->l_whence = (src)->l_whence; \ (dst)->l_start = (src)->l_start; \ (dst)->l_len = (src)->l_len; \ (dst)->l_pid = (src)->l_pid; static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) { struct compat_flock fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { struct compat_flock64 fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { struct compat_flock fl; memset(&fl, 0, sizeof(struct compat_flock)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { struct compat_flock64 fl; BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start)); BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } #undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) { switch (cmd) { case F_GETLK64: return F_GETLK; case F_SETLK64: return F_SETLK; case F_SETLKW64: return F_SETLKW; } return cmd; } /* * GETLK was successful and we need to return the data, but it needs to fit in * the compat structure. * l_start shouldn't be too big, unless the original start + end is greater than * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return * -EOVERFLOW in that case. l_len could be too big, in which case we just * truncate it, and only allow the app to see that part of the conflicting lock * that might make sense to it anyway */ static int fixup_compat_flock(struct flock *flock) { if (flock->l_start > COMPAT_OFF_T_MAX) return -EOVERFLOW; if (flock->l_len > COMPAT_OFF_T_MAX) flock->l_len = COMPAT_OFF_T_MAX; return 0; } static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { CLASS(fd_raw, f)(fd); struct flock flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (err) break; err = fixup_compat_flock(&flock); if (!err) err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (!err) err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { return do_compat_fcntl64(fd, cmd, arg); } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { switch (cmd) { case F_GETLK64: case F_SETLK64: case F_SETLKW64: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: return -EINVAL; } return do_compat_fcntl64(fd, cmd, arg); } #endif /* Table to convert sigio signal codes into poll band bitmaps */ static const __poll_t band_table[NSIGPOLL] = { EPOLLIN | EPOLLRDNORM, /* POLL_IN */ EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND, /* POLL_OUT */ EPOLLIN | EPOLLRDNORM | EPOLLMSG, /* POLL_MSG */ EPOLLERR, /* POLL_ERR */ EPOLLPRI | EPOLLRDBAND, /* POLL_PRI */ EPOLLHUP | EPOLLERR /* POLL_HUP */ }; static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { const struct cred *cred; int ret; rcu_read_lock(); cred = __task_cred(p); ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); rcu_read_unlock(); return ret; } static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, int fd, int reason, enum pid_type type) { /* * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; switch (signum) { default: { kernel_siginfo_t si; /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* * Posix definies POLL_IN and friends to be signal * specific si_codes for SIG_POLL. Linux extended * these si_codes to other signals in a way that is * ambiguous if other signals also have signal * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. */ if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; } fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; unsigned long flags; struct pid *pid; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigio_to_task(p, fown, fd, band, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct file *file) { struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; fown = file_f_owner(file); if (!fown) return 0; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; ret = 1; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigurg_to_task(p, fown, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); return ret; } static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, * do nothing and return 0. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". * */ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } struct fasync_struct *fasync_alloc(void) { return kmem_cache_alloc(fasync_cache, GFP_KERNEL); } /* * NOTE! This can be used only for unused fasync entries: * entries that actually got inserted on the fasync list * need to be released by rcu - see fasync_remove_entry. */ void fasync_free(struct fasync_struct *new) { kmem_cache_free(fasync_cache, new); } /* * Insert a new entry into the fasync list. Return the pointer to the * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". */ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; write_unlock_irq(&fa->fa_lock); goto out; } rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return fa; } /* * Add a fasync entry. Return negative on error, positive if * added, and zero if did nothing but change an existing one. */ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *new; new = fasync_alloc(); if (!new) return -ENOMEM; /* * fasync_insert_entry() returns the old (update) entry if * it existed. * * So free the (unused) new entry and return 0 to let the * caller know that we didn't add any new fasync entries. */ if (fasync_insert_entry(fd, filp, fapp, new)) { fasync_free(new); return 0; } return 1; } /* * fasync_helper() is used by almost all character device drivers * to set up the fasync queue, and for regular files by the file * lease code. It returns negative on error, 0 if it did no changes * and positive if it added/deleted the entry. */ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) { if (!on) return fasync_remove_entry(filp, fapp); return fasync_add_entry(fd, filp, fapp); } EXPORT_SYMBOL(fasync_helper); /* * rcu_read_lock() is held */ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = file_f_owner(fa->fa_file); if (!fown) goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { rcu_read_lock(); kill_fasync_rcu(rcu_dereference(*fp), sig, band); rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { /* * Please add new bits here to ensure allocation uniqueness. * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } module_init(fcntl_init) |
23 3 26 24 2 26 26 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/attr.c * * Copyright (C) 1991, 1992 Linus Torvalds * changes by Thomas Schoebel-Theuer */ #include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/filelock.h> #include <linux/security.h> /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. * We retain backwards compatibility and require setgid bit to be removed * unconditionally if S_IXGRP is set. Otherwise we have the exact same * requirements as setattr_prepare() and setattr_copy(). * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. */ int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; if (!(mode & S_ISGID)) return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } EXPORT_SYMBOL(setattr_should_drop_sgid); /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both * set{g,u}id bits need to be removed the corresponding mask of both flags is * returned. * * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. */ int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * chgrp_ok - verify permissions to chgrp inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * setattr_prepare - check if attribute changes to a dentry are allowed * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * * Check if we are allowed to change the attributes contained in @attr * in the given dentry. This includes the normal unix access permission * checks, as well as checks for rlimits and others. The function also clears * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; /* * First check size constraints. These can't be overriden using * ATTR_FORCE. */ if (ia_valid & ATTR_SIZE) { int error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; } /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) goto kill_priv; /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } kill_priv: /* User has permission for the change */ if (ia_valid & ATTR_KILL_PRIV) { int error; error = security_inode_killpriv(idmap, dentry); if (error) return error; } return 0; } EXPORT_SYMBOL(setattr_prepare); /** * inode_newsize_ok - may this inode be truncated to a given size * @inode: the inode to be truncated * @offset: the new size to assign to the inode * * inode_newsize_ok must be called with i_mutex held. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). * * Return: 0 on success, -ve errno on failure */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { if (offset < 0) return -EINVAL; if (inode->i_size < offset) { unsigned long limit; limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) goto out_big; } else { /* * truncation of in-use swapfiles is disallowed - it would * cause subsequent swapout to scribble on the now-freed * blocks. */ if (IS_SWAPFILE(inode)) return -ETXTBSY; } return 0; out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; } EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy_mgtime - update timestamps for mgtime inodes * @inode: inode timestamps to be updated * @attr: attrs for the update * * With multigrain timestamps, take more care to prevent races when * updating the ctime. Always update the ctime to the very latest using * the standard mechanism, and use that to populate the atime and mtime * appropriately (unless those are being set to specific values). */ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; struct timespec64 now; if (ia_valid & ATTR_CTIME) { /* * In the case of an update for a write delegation, we must respect * the value in ia_ctime and not use the current time. */ if (ia_valid & ATTR_DELEG) now = inode_set_ctime_deleg(inode, attr->ia_ctime); else now = inode_set_ctime_current(inode); } else { /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */ WARN_ON_ONCE(ia_valid & ATTR_MTIME); now = current_time(inode); } if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); else if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, now); if (ia_valid & ATTR_MTIME_SET) inode_set_mtime_to_ts(inode, attr->ia_mtime); else if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, now); } /** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } if (is_mgtime(inode)) return setattr_copy_mgtime(inode, attr); if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME) { if (ia_valid & ATTR_DELEG) inode_set_ctime_deleg(inode, attr->ia_ctime); else inode_set_ctime_to_ts(inode, attr->ia_ctime); } } EXPORT_SYMBOL(setattr_copy); int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; } /* * If utimes(2) and friends are called with times == NULL (or both * times are UTIME_NOW), then we need to check for write permission */ if (ia_valid & ATTR_TOUCH) { if (IS_IMMUTABLE(inode)) return -EPERM; if (!inode_owner_or_capable(idmap, inode)) { error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } } return 0; } EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * * The caller must hold the i_mutex on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding * the file open for write, as there can be no conflicting delegation in * that case. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; int error; struct timespec64 now; unsigned int ia_valid = attr->ia_valid; WARN_ON_ONCE(!inode_is_locked(inode)); error = may_setattr(idmap, inode, ia_valid); if (error) return error; if ((ia_valid & ATTR_MODE)) { /* * Don't allow changing the mode of symlinks: * * (1) The vfs doesn't take the mode of symlinks into account * during permission checking. * (2) This has never worked correctly. Most major filesystems * did return EOPNOTSUPP due to interactions with POSIX ACLs * but did still updated the mode of the symlink. * This inconsistency led system call wrapper providers such * as libc to block changing the mode of symlinks with * EOPNOTSUPP already. * (3) To even do this in the first place one would have to use * specific file descriptors and quite some effort. */ if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; /* Flag setting protected by i_mutex */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } now = current_time(inode); attr->ia_ctime = now; if (!(ia_valid & ATTR_ATIME_SET)) attr->ia_atime = now; else attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now; else attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); if (error < 0) return error; if (error == 0) ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV; } /* * We now pass ATTR_KILL_S*ID to the lower level setattr function so * that the function has the ability to reinterpret a mode change * that's due to these bits. This adds an implicit restriction that * no function will ever call notify_change with both ATTR_MODE and * ATTR_KILL_S*ID set. */ if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && (ia_valid & ATTR_MODE)) BUG(); if (ia_valid & ATTR_KILL_SUID) { if (mode & S_ISUID) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = (inode->i_mode & ~S_ISUID); } } if (ia_valid & ATTR_KILL_SGID) { if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; } attr->ia_mode &= ~S_ISGID; } } if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; /* * Verify that uid/gid changes are valid in the target * namespace of the superblock. */ if (ia_valid & ATTR_UID && !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; error = security_inode_setattr(idmap, dentry, attr); if (error) return error; /* * If ATTR_DELEG is set, then these attributes are being set on * behalf of the holder of a write delegation. We want to avoid * breaking the delegation in this case. */ if (!(ia_valid & ATTR_DELEG)) { error = try_break_deleg(inode, delegated_inode); if (error) return error; } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); else error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); security_inode_post_setattr(idmap, dentry, ia_valid); } return error; } EXPORT_SYMBOL(notify_change); |
131 132 130 131 131 130 131 8 1 131 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 | // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/swap_state.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie * * Rewritten to use page cache, (C) 1998 Stephen Tweedie */ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/mempolicy.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" #include "swap.h" /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif }; struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) #define SWAP_RA_VAL(addr, win, hits) \ (((addr) & PAGE_MASK) | \ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ ((hits) & SWAP_RA_HITS_MASK)) /* Initial readahead hits is 4 to start up with a small window */ #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); printk("Total swap = %lukB\n", K(total_swap_pages)); } void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swap_cache_index(entry); void *shadow; shadow = xa_load(&address_space->i_pages, idx); if (xa_is_value(shadow)) return shadow; return NULL; } /* * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and 'swap' instead of mapping and index. */ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swap_cache_index(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); unsigned long i, nr = folio_nr_pages(folio); void *old; xas_set_update(&xas, workingset_update_node); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio_set_swapcache(folio); folio->swap = entry; do { xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); if (shadowp) { old = xas_load(&xas); if (xa_is_value(old)) *shadowp = old; } xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (!xas_error(&xas)) return 0; folio_clear_swapcache(folio); folio_ref_sub(folio, nr); return xas_error(&xas); } /* * This must be called only on folios that have * been verified to be in the swap cache. */ void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i; long nr = folio_nr_pages(folio); pgoff_t idx = swap_cache_index(entry); XA_STATE(xas, &address_space->i_pages, idx); xas_set_update(&xas, workingset_update_node); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != folio, entry); xas_next(&xas); } folio->swap.val = 0; folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } /* * This must be called only on folios that have * been verified to be in the swap cache and locked. * It will never put the folio into the free list, * the caller has a reference on the folio. */ void delete_from_swap_cache(struct folio *folio) { swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end) { unsigned long curr = begin; void *old; for (;;) { swp_entry_t entry = swp_entry(type, curr); unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, index); xas_set_update(&xas, workingset_update_node); xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); } xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); if (curr > end) break; } } /* * If we are the only user, then try to free up the swap cache. * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct folio *folio) { if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } } /* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ void free_page_and_swap_cache(struct page *page) { struct folio *folio = page_folio(page); free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); } /* * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[folios.nr] = encoded_nr_pages(pages[++i]); if (folio_batch_add(&folios, folio) == 0) folios_put_refs(&folios, refs); } if (folios.nr) folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) { return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } /* * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the folio * lock before returning. * * Caller must lock the swap device or hold a reference to keep it valid. */ struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio; folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ if (unlikely(folio_test_large(folio))) return folio; readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; ra_val = GET_SWAP_RA_VAL(vma); win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } if (readahead) { count_vm_event(SWAP_RA_HIT); if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } else { folio = NULL; } return folio; } /** * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * * This differs from filemap_get_folio() in that it will also look for the * folio in the swap cache. * * Return: The found folio or %NULL. */ struct folio *filemap_get_incore_folio(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; struct folio *folio = filemap_get_entry(mapping, index); if (!folio) return ERR_PTR(-ENOENT); if (!xa_is_value(folio)) return folio; if (!shmem_mapping(mapping)) return ERR_PTR(-ENOENT); swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(swp)) return ERR_PTR(-ENOENT); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) return ERR_PTR(-ENOENT); index = swap_cache_index(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); return folio; } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; for (;;) { int err; /* * First check the swap cache. Since this is normally * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) goto got_folio; /* * Just skip read ahead for unused swap slot. */ if (!swap_entry_swapped(si, entry)) goto put_and_return; /* * Get a new folio to read into from swap. Allocate it now if * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, * when -EEXIST will cause any racers to loop around until we * add it to cache. */ if (!new_folio) { new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); if (!new_folio) goto put_and_return; } /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry, 1); if (!err) break; else if (err != -EEXIST) goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() * on the same entry waiting forever here because SWAP_HAS_CACHE * is set but the folio is not the swap cache yet. This can * happen today if mem_cgroup_swapin_charge_folio() below * triggers reclaim through zswap, which may call * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) goto put_and_return; /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(new_folio); __folio_set_swapbacked(new_folio); if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; memcg1_swapin(entry, 1); if (shadow) workingset_refault(new_folio, shadow); /* Caller will initiate read into locked new_folio */ folio_add_lru(new_folio); *new_page_allocated = true; folio = new_folio; got_folio: result = folio; goto put_and_return; fail_unlock: put_swap_folio(new_folio, entry); folio_unlock(new_folio); put_and_return: if (!(*new_page_allocated) && new_folio) folio_put(new_folio); return result; } /* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. */ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { struct swap_info_struct *si; bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; si = get_swap_device(entry); if (!si) return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) swap_read_folio(folio, plug); put_swap_device(si); return folio; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, unsigned long offset, int hits, int max_pages, int prev_win) { unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get * stuck here forever, so check for an adjacent offset instead * (and don't even bother to check whether swap type is same). */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; } else { unsigned int roundup = 4; while (roundup < pages) roundup <<= 1; pages = roundup; } if (pages > max_pages) pages = max_pages; /* Don't shrink readahead too fast */ last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; return pages; } static unsigned long swapin_nr_pages(unsigned long offset) { static unsigned long prev_offset; unsigned int hits, pages, max_pages; static atomic_t last_readahead_pages; max_pages = 1 << READ_ONCE(page_cluster); if (max_pages <= 1) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, max_pages, atomic_read(&last_readahead_pages)); if (!hits) WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; } /** * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * * Note: it is intentional that the same NUMA policy and interleave index * are used for every page of the readahead: neighbouring pages on swap * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; if (!start_offset) /* First page is swap header. */ start_offset++; if (end_offset >= si->max) end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ folio = __read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) { struct address_space *spaces, *space; unsigned int i, nr; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); if (!spaces) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); } nr_swapper_spaces[type] = nr; swapper_spaces[type] = spaces; return 0; } void exit_swap_address_space(unsigned int type) { int i; struct address_space *spaces = swapper_spaces[type]; for (i = 0; i < nr_swapper_spaces[type]; i++) VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); kvfree(spaces); nr_swapper_spaces[type] = 0; swapper_spaces[type] = NULL; } static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; unsigned long faddr, prev_faddr, left, right; unsigned int max_win, hits, prev_win, win; max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) return 1; faddr = vmf->address; ra_val = GET_SWAP_RA_VAL(vma); prev_faddr = SWAP_RA_ADDR(ra_val); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) return 1; if (faddr == prev_faddr + PAGE_SIZE) left = faddr; else if (prev_faddr == faddr + PAGE_SIZE) left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; else left = faddr - (((win - 1) / 2) << PAGE_SHIFT); right = left + (win << PAGE_SHIFT); if ((long)left < 0) left = 0; *start = max3(left, vma->vm_start, faddr & PMD_MASK); *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); return win; } /** * swap_vma_readahead - swap in pages in hope we need them soon * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; struct folio *folio; pte_t *pte = NULL, pentry; int win; unsigned long start, end, addr; swp_entry_t entry; pgoff_t ilx; bool page_allocated; win = swap_vma_ra_win(vmf, &start, &end); if (win == 1) goto skip; ilx = targ_ilx - PFN_DOWN(vmf->address - start); blk_start_plug(&plug); for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) break; } pentry = ptep_get_lockless(pte); if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) continue; pte_unmap(pte); pte = NULL; folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, &splug); if (addr != vmf->address) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } if (pte) pte_unmap(pte); blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; } /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. */ struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); folio = swap_use_vma_readahead() ? swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); return folio; } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &enable_vma_readahead); if (ret) return ret; return count; } static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, NULL, }; static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; static int __init swap_init_sysfs(void) { int err; struct kobject *swap_kobj; swap_kobj = kobject_create_and_add("swap", mm_kobj); if (!swap_kobj) { pr_err("failed to create swap kobject\n"); return -ENOMEM; } err = sysfs_create_group(swap_kobj, &swap_attr_group); if (err) { pr_err("failed to register swap group\n"); goto delete_obj; } return 0; delete_obj: kobject_put(swap_kobj); return err; } subsys_initcall(swap_init_sysfs); #endif |
22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 | // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/setup.c * * Copyright (C) 1995-2001 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/ioport.h> #include <linux/delay.h> #include <linux/initrd.h> #include <linux/console.h> #include <linux/cache.h> #include <linux/screen_info.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/root_dev.h> #include <linux/cpu.h> #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/panic_notifier.h> #include <linux/proc_fs.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/psci.h> #include <linux/sched/task.h> #include <linux/scs.h> #include <linux/mm.h> #include <asm/acpi.h> #include <asm/fixmap.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/elf.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> #include <asm/rsi.h> #include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/smp_plat.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/traps.h> #include <asm/efi.h> #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h> static int num_standard_resources; static struct resource *standard_resources; phys_addr_t __fdt_pointer __initdata; u64 mmu_enabled_at_boot __initdata; /* * Standard memory resources */ static struct resource mem_res[] = { { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_SYSTEM_RAM }, { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_SYSTEM_RAM } }; #define kernel_code mem_res[0] #define kernel_data mem_res[1] /* * The recorded values of x0 .. x3 upon kernel entry. */ u64 __cacheline_aligned boot_args[4]; void __init smp_setup_processor_id(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; set_cpu_logical_map(0, mpidr); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { return phys_id == cpu_logical_map(cpu); } struct mpidr_hash mpidr_hash; /** * smp_build_mpidr_hash - Pre-compute shifts required at each affinity * level in order to build a linear index from an * MPIDR value. Resulting algorithm is a collision * free hash carried out through shifting and ORing */ static void __init smp_build_mpidr_hash(void) { u32 i, affinity, fs[4], bits[4], ls; u64 mask = 0; /* * Pre-scan the list of MPIDRS and filter out bits that do * not contribute to affinity levels, ie they never toggle. */ for_each_possible_cpu(i) mask |= (cpu_logical_map(i) ^ cpu_logical_map(0)); pr_debug("mask of set bits %#llx\n", mask); /* * Find and stash the last and first bit set at all affinity levels to * check how many bits are required to represent them. */ for (i = 0; i < 4; i++) { affinity = MPIDR_AFFINITY_LEVEL(mask, i); /* * Find the MSB bit and LSB bits position * to determine how many bits are required * to express the affinity level. */ ls = fls(affinity); fs[i] = affinity ? ffs(affinity) - 1 : 0; bits[i] = ls - fs[i]; } /* * An index can be created from the MPIDR_EL1 by isolating the * significant bits at each affinity level and by shifting * them in order to compress the 32 bits values space to a * compressed set of values. This is equivalent to hashing * the MPIDR_EL1 through shifting and ORing. It is a collision free * hash though not minimal since some levels might contain a number * of CPUs that is not an exact power of 2 and their bit * representation might contain holes, eg MPIDR_EL1[7:0] = {0x2, 0x80}. */ mpidr_hash.shift_aff[0] = MPIDR_LEVEL_SHIFT(0) + fs[0]; mpidr_hash.shift_aff[1] = MPIDR_LEVEL_SHIFT(1) + fs[1] - bits[0]; mpidr_hash.shift_aff[2] = MPIDR_LEVEL_SHIFT(2) + fs[2] - (bits[1] + bits[0]); mpidr_hash.shift_aff[3] = MPIDR_LEVEL_SHIFT(3) + fs[3] - (bits[2] + bits[1] + bits[0]); mpidr_hash.mask = mask; mpidr_hash.bits = bits[3] + bits[2] + bits[1] + bits[0]; pr_debug("MPIDR hash: aff0[%u] aff1[%u] aff2[%u] aff3[%u] mask[%#llx] bits[%u]\n", mpidr_hash.shift_aff[0], mpidr_hash.shift_aff[1], mpidr_hash.shift_aff[2], mpidr_hash.shift_aff[3], mpidr_hash.mask, mpidr_hash.bits); /* * 4x is an arbitrary value used to warn on a hash table much bigger * than expected on most systems. */ if (mpidr_hash_size() > 4 * num_possible_cpus()) pr_warn("Large number of MPIDR hash buckets detected\n"); } static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; if (dt_virt) memblock_reserve(dt_phys, size); /* * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. * Pass dt_phys directly. */ if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" "\nPlease check your bootloader.", &dt_phys, dt_virt); /* * Note that in this _really_ early stage we cannot even BUG() * or oops, so the least terrible thing to do is cpu_relax(), * or else we could end-up printing non-initialized data, etc. */ while (true) cpu_relax(); } /* Early fixups are done, map the FDT as read-only now */ fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); name = of_flat_dt_get_machine_name(); if (!name) return; pr_info("Machine model: %s\n", name); dump_stack_set_arch_desc("%s (DT)", name); } static void __init request_standard_resources(void) { struct memblock_region *region; struct resource *res; unsigned long i = 0; size_t res_size; kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); insert_resource(&iomem_resource, &kernel_code); insert_resource(&iomem_resource, &kernel_data); num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; res->flags = IORESOURCE_MEM; res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; } else { res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; } insert_resource(&iomem_resource, res); } } static int __init reserve_memblock_reserved_regions(void) { u64 i, j; for (i = 0; i < num_standard_resources; ++i) { struct resource *mem = &standard_resources[i]; phys_addr_t r_start, r_end, mem_size = resource_size(mem); if (!memblock_is_region_reserved(mem->start, mem_size)) continue; for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end); if (start > mem->end || end < mem->start) continue; reserve_region_with_split(mem, start, end, "reserved"); } } return 0; } arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; u64 cpu_logical_map(unsigned int cpu) { return __cpu_logical_map[cpu]; } void __init __no_sanitize_address setup_arch(char **cmdline_p) { setup_initial_init_mm(_stext, _etext, _edata, _end); *cmdline_p = boot_command_line; kaslr_init(); early_fixmap_init(); early_ioremap_init(); setup_machine_fdt(__fdt_pointer); /* * Initialise the static keys early as they may be enabled by the * cpufeature code and early parameters. */ jump_label_init(); parse_early_param(); dynamic_scs_init(); /* * The primary CPU enters the kernel with all DAIF exceptions masked. * * We must unmask Debug and SError before preemption or scheduling is * possible to ensure that these are consistently unmasked across * threads, and we want to unmask SError as soon as possible after * initializing earlycon so that we can report any SErrors immediately. * * IRQ and FIQ will be unmasked after the root irqchip has been * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ cpu_uninstall_idmap(); xen_early_init(); efi_init(); if (!efi_enabled(EFI_BOOT)) { if ((u64)_text % MIN_KIMG_ALIGN) pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!"); WARN_TAINT(mmu_enabled_at_boot, TAINT_FIRMWARE_WORKAROUND, FW_BUG "Booted with MMU enabled!"); } arm64_memblock_init(); paging_init(); acpi_table_upgrade(); /* Parse the ACPI tables for possible boot-time configuration */ acpi_boot_table_init(); if (acpi_disabled) unflatten_device_tree(); bootmem_init(); kasan_init(); request_standard_resources(); early_ioremap_reset(); if (acpi_disabled) psci_dt_init(); else psci_acpi_init(); arm64_rsi_init(); init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation * faults in case uaccess_enable() is inadvertently called by the init * thread. */ init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif if (boot_args[1] || boot_args[2] || boot_args[3]) { pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n" "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n" "This indicates a broken bootloader or old kernel\n", boot_args[1], boot_args[2], boot_args[3]); } } static inline bool cpu_can_disable(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU const struct cpu_operations *ops = get_cpu_ops(cpu); if (ops && ops->cpu_can_disable) return ops->cpu_can_disable(cpu); #endif return false; } bool arch_cpu_is_hotpluggable(int num) { return cpu_can_disable(num); } static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", offset, KIMAGE_VADDR); pr_emerg("PHYS_OFFSET: 0x%llx\n", PHYS_OFFSET); } else { pr_emerg("Kernel Offset: disabled\n"); } } static int arm64_panic_block_dump(struct notifier_block *self, unsigned long v, void *p) { dump_kernel_offset(); dump_cpu_features(); dump_mem_limit(); return 0; } static struct notifier_block arm64_panic_block = { .notifier_call = arm64_panic_block_dump }; static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, &arm64_panic_block); return 0; } device_initcall(register_arm64_panic_block); static int __init check_mmu_enabled_at_boot(void) { if (!efi_enabled(EFI_BOOT) && mmu_enabled_at_boot) panic("Non-EFI boot detected with MMU and caches enabled"); return 0; } device_initcall_sync(check_mmu_enabled_at_boot); |
34 34 34 34 34 37 37 3 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 6 34 34 34 34 34 34 34 34 34 34 34 34 34 3 38 35 3 4 2 4 4 34 34 34 34 34 34 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson * * 2003-06-02 Jim Houston - Concurrent Computer Corp. * Changes to use preallocated sigqueue structures * to allow signals to be sent reliably. */ #include <linux/slab.h> #include <linux/export.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> #include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> #include <asm/param.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> #include <asm/syscall.h> /* for syscall_get_* */ #include "time/posix-timers.h" /* * SLAB caches for signal bits. */ static struct kmem_cache *sigqueue_cachep; int print_fatal_signals __read_mostly; static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; } static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); /* SIGKILL and SIGSTOP may not be sent to the global init */ if (unlikely(is_global_init(t) && sig_kernel_only(sig))) return true; if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; /* Only allow kernel generated signals to this kthread */ if (unlikely((t->flags & PF_KTHREAD) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; return sig_handler_ignored(handler, sig); } static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the * signal handler may change by the time it is * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return false; /* * Tracers may want to know about even ignored signal unless it * is SIGKILL which can't be reported anyway but can be ignored * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) return false; return sig_task_ignored(t, sig, force); } /* * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; switch (_NSIG_WORDS) { default: for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) ready |= signal->sig[i] &~ blocked->sig[i]; break; case 4: ready = signal->sig[3] &~ blocked->sig[3]; ready |= signal->sig[2] &~ blocked->sig[2]; ready |= signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 2: ready = signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 1: ready = signal->sig[0] &~ blocked->sig[0]; } return ready != 0; } #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) || cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ return false; } void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) { if (unlikely(test_thread_flag(TIF_SIGPENDING))) clear_thread_flag(TIF_SIGPENDING); } } EXPORT_SYMBOL(recalc_sigpending); void calculate_sigpending(void) { /* Have any signals or users of TIF_SIGPENDING been delayed * until after fork? */ spin_lock_irq(¤t->sighand->siglock); set_tsk_thread_flag(current, TIF_SIGPENDING); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; s = pending->signal.sig; m = mask->sig; /* * Handle the first word specially: it contains the * synchronous signals that need to be dequeued first. */ x = *s &~ *m; if (x) { if (x & SYNCHRONOUS_MASK) x &= SYNCHRONOUS_MASK; sig = ffz(~x) + 1; return sig; } switch (_NSIG_WORDS) { default: for (i = 1; i < _NSIG_WORDS; ++i) { x = *++s &~ *++m; if (!x) continue; sig = ffz(~x) + i*_NSIG_BPW + 1; break; } break; case 2: x = s[1] &~ m[1]; if (!x) break; sig = ffz(~x) + _NSIG_BPW + 1; break; case 1: /* Nothing to do */ break; } return sig; } static inline void print_dropped_signal(int sig) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!print_fatal_signals) return; if (!__ratelimit(&ratelimit_state)) return; pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } /** * task_set_jobctl_pending - set jobctl pending bits * @task: target task * @mask: pending bits to set * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is * cleared. If @task is already being killed or exiting, this function * becomes noop. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. */ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) return false; if (mask & JOBCTL_STOP_SIGMASK) task->jobctl &= ~JOBCTL_STOP_SIGMASK; task->jobctl |= mask; return true; } /** * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task * * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. * Clear it and wake up the ptracer. Note that we don't need any further * locking. @task->siglock guarantees that @task->parent points to the * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task * @mask: pending bits to clear * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other * STOP bits are cleared together. * * If clearing of @mask leaves no stop or trap pending, this function calls * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); if (mask & JOBCTL_STOP_PENDING) mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; task->jobctl &= ~mask; if (!(task->jobctl & JOBCTL_PENDING_MASK)) task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if group stop completion should be notified to the parent, %false * otherwise. */ static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; bool consume = task->jobctl & JOBCTL_STOP_CONSUME; WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; if (!WARN_ON_ONCE(sig->group_stop_count == 0)) sig->group_stop_count--; /* * Tell the caller to notify completion iff we are entering into a * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; } void task_join_group_stop(struct task_struct *task) { unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; struct signal_struct *sig = current->signal; if (sig->group_stop_count) { sig->group_stop_count++; mask |= JOBCTL_STOP_CONSUME; } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) return; /* Have the new thread join an on-going signal group stop */ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, int override_rlimit) { struct ucounts *ucounts; long sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. * * NOTE! A pending signal will hold on to the user refcount, * and we get/put the refcount only when the sigpending count * changes from/to zero. */ rcu_read_lock(); ucounts = task_ucounts(t); sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); return NULL; } return ucounts; } static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, const unsigned int sigqueue_flags) { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; q->ucounts = ucounts; } /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit) { struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); struct sigqueue *q; if (!ucounts) return NULL; q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); return NULL; } __sigqueue_init(q, ucounts, 0); return q; } static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) { posixtimer_sigqueue_putref(q); return; } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; sigemptyset(&queue->signal); while (!list_empty(&queue->list)) { q = list_entry(queue->list.next, struct sigqueue , list); list_del_init(&q->list); __sigqueue_free(q); } } /* * Flush all pending signals for this kthread. */ void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } EXPORT_SYMBOL(flush_signals); void ignore_signals(struct task_struct *t) { int i; for (i = 0; i < _NSIG; ++i) t->sighand->action[i].sa.sa_handler = SIG_IGN; flush_signals(t); } /* * Flush all handlers for a task. */ void flush_signal_handlers(struct task_struct *t, int force_default) { int i; struct k_sigaction *ka = &t->sighand->action[0]; for (i = _NSIG ; i != 0 ; i--) { if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; #ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); ka++; } } bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return true; if (handler != SIG_IGN && handler != SIG_DFL) return false; /* If dying, we handle all new signals by ignoring them */ if (fatal_signal_pending(tsk)) return false; /* if ptraced, let the tracer determine */ return !tsk->ptrace; } static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { struct sigqueue *q, *first = NULL; /* * Collect the siginfo appropriate to this signal. Check if * there is another siginfo for the same signal. */ list_for_each_entry(q, &list->list, list) { if (q->info.si_signo == sig) { if (first) goto still_pending; first = q; } } sigdelset(&list->signal, sig); if (first) { still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); /* * posix-timer signals are preallocated and freed when the last * reference count is dropped in posixtimer_deliver_signal() or * immediately on timer deletion when the signal is not pending. * Spare the extra round through __sigqueue_free() which is * ignoring preallocated signals. */ if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER))) *timer_sigq = first; else __sigqueue_free(first); } else { /* * Ok, it wasn't in the queue. This must be * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { int sig = next_signal(pending, mask); if (sig) collect_signal(sig, pending, info, timer_sigq); return sig; } /* * Try to dequeue a signal. If a deliverable signal is found fill in the * caller provided siginfo and return the signal number. Otherwise return * 0. */ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { struct task_struct *tsk = current; struct sigqueue *timer_sigq; int signr; lockdep_assert_held(&tsk->sighand->siglock); again: *type = PIDTYPE_PID; timer_sigq = NULL; signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq); if (!signr) { *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &timer_sigq); if (unlikely(signr == SIGALRM)) posixtimer_rearm_itimer(tsk); } recalc_sigpending(); if (!signr) return 0; if (unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our * caller might release the siglock and then the pending * stop signal it is about to process is no longer in the * pending bitmasks, but must still be cleared by a SIGCONT * (and overruled by a SIGKILL). So those cases clear this * shared flag after we've set it. Note that this flag may * remain set after the signal we return is ignored or * handled. That doesn't matter because its only purpose * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) { if (!posixtimer_deliver_signal(info, timer_sigq)) goto again; } return signr; } EXPORT_SYMBOL_GPL(dequeue_signal); static int dequeue_synchronous_signal(kernel_siginfo_t *info) { struct task_struct *tsk = current; struct sigpending *pending = &tsk->pending; struct sigqueue *q, *sync = NULL; /* * Might a synchronous signal be in the queue? */ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) return 0; /* * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; goto next; } } return 0; next: /* * Check if there is another siginfo for the same signal. */ list_for_each_entry_continue(q, &pending->list, list) { if (q->info.si_signo == sync->info.si_signo) goto still_pending; } sigdelset(&pending->signal, sync->info.si_signo); recalc_sigpending(); still_pending: list_del_init(&sync->list); copy_siginfo(info, &sync->info); __sigqueue_free(sync); return info->si_signo; } /* * Tell a process that it has a new active signal.. * * NOTE! we rely on the previous spin_lock to * lock interrupts for us! We can only be called with * "siglock" held, and the local interrupt must * have been disabled when that got acquired! * * No need to set need_resched since signal event passing * goes through ->blocked */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { lockdep_assert_held(&t->sighand->siglock); set_tsk_thread_flag(t, TIF_SIGPENDING); /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q); static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q) { if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER)) __sigqueue_free(q); else posixtimer_sig_ignore(tsk, q); } /* Remove signals in mask from the pending set and queue. */ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; lockdep_assert_held(&p->sighand->siglock); sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); sigqueue_free_ignored(p, q); } } } static inline int is_si_special(const struct kernel_siginfo *info) { return info <= SEND_SIG_PRIV; } static inline bool si_fromuser(const struct kernel_siginfo *info) { return info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)); } /* * called with RCU read lock from check_kill_permission() */ static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); return uid_eq(cred->euid, tcred->suid) || uid_eq(cred->euid, tcred->uid) || uid_eq(cred->uid, tcred->suid) || uid_eq(cred->uid, tcred->uid) || ns_capable(tcred->user_ns, CAP_KILL); } /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct kernel_siginfo *info, struct task_struct *t) { struct pid *sid; int error; if (!valid_signal(sig)) return -EINVAL; if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ if (error) return error; if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); /* * We don't return the error if sid == NULL. The * task was unhashed, the caller must notice this. */ if (!sid || sid == task_session(current)) break; fallthrough; default: return -EPERM; } } return security_task_kill(t, info, sig, NULL); } /** * ptrace_trap_notify - schedule trap to notify ptracer * @t: tracee wanting to notify tracer * * This function schedules sticky ptrace trap which is cleared on the next * TRAP_STOP to notify ptracer of an event. @t must have been seized by * ptracer. * * If @t is running, STOP trap will be taken. If trapped for STOP and * ptracer is listening for events, tracee is woken up so that it can * re-trap for the new event. If trapped otherwise, STOP trap will be * eventually taken without returning to userland after the existing traps * are finished by PTRACE_CONT. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation * time regardless of blocking, ignoring, or handling. This does the * actual continuing for SIGCONT, but not the actual stopping for stop * signals. The process stop is done as a signal action for SIG_DFL. * * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; sigset_t flush; if (signal->flags & SIGNAL_GROUP_EXIT) { if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, drop the signal. */ return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. */ siginitset(&flush, sigmask(SIGCONT)); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ siginitset(&flush, SIG_KERNEL_STOP_MASK); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) { flush_sigqueue_mask(p, &flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) { t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); } else ptrace_trap_notify(t); } /* * Notify the parent with CLD_CONTINUED if we were stopped. * * If we were in the middle of a group stop, we pretend it * was already finished, and then continued. Since SIGCHLD * doesn't queue we report only CLD_STOPPED, as if the next * CLD_CONTINUED was dropped. */ why = 0; if (signal->flags & SIGNAL_STOP_STOPPED) why |= SIGNAL_CLD_CONTINUED; else if (signal->group_stop_count) why |= SIGNAL_CLD_STOPPED; if (why) { /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } } return !sig_ignored(p, sig, force); } /* * Test if P wants to take SIG. After we've checked all threads with this, * it's equivalent to finding no threads not blocking SIG. Any threads not * blocking SIG were ruled out because they are not running and already * have pending signals. Such threads will dequeue from the shared queue * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) return false; if (p->flags & PF_EXITING) return false; if (sig == SIGKILL) return true; if (task_is_stopped_or_traced(p)) return false; return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; /* * Now find a thread we can wake up to take the signal off the queue. * * Try the suggested task first (may or may not be the main thread). */ if (wants_signal(sig, p)) t = p; else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. */ return; else { /* * Otherwise try to find a suitable thread. */ t = signal->curr_target; while (!wants_signal(sig, t)) { t = next_thread(t); if (t == signal->curr_target) /* * No thread needs to be woken. * Any eligible threads will see * the signal in the queue soon. */ return; } signal->curr_target = t; } /* * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ if (!sig_kernel_coredump(sig)) { /* * Start a group exit and wake everybody up. * This way we don't have other threads * running and doing things after a slower * thread has the fatal signal pending. */ signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; __for_each_thread(signal, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return; } } /* * The signal is already in the shared-pending queue. * Tell the chosen thread to wake up and dequeue it. */ signal_wake_up(t, sig == SIGKILL); return; } static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } static int __send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) goto ret; result = TRACE_SIGNAL_DELIVERED; /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* * Real-time signals must be queued if sent by sigqueue, or * some other real-time mechanism. It is implementation * defined whether kill() does so. We attempt to do so, on * the principle of least surprise, but since kill is not * allowed to fail with EAGAIN when low on memory we just * make sure at least one signal gets delivered and don't * pass on the info struct. */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else override_rlimit = 0; q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); rcu_read_lock(); q->info.si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), current_uid()); rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; q->info.si_pid = 0; q->info.si_uid = 0; break; default: copy_siginfo(&q->info, info); break; } } else if (!is_si_special(info) && sig >= SIGRTMIN && info->si_code != SI_USER) { /* * Queue overflow, abort. We may abort if the * signal was rt and sent by user using something * other than kill(). */ result = TRACE_SIGNAL_OVERFLOW_FAIL; ret = -EAGAIN; goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. */ result = TRACE_SIGNAL_LOSE_INFO; } out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); /* Let multiprocess signals appear after on-going forks */ if (type > PIDTYPE_TGID) { struct multiprocess_signals *delayed; hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { sigset_t *signal = &delayed->signal; /* Can't queue both a stop and a continue signal */ if (sig == SIGCONT) sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); else if (sig_kernel_stop(sig)) sigdelset(signal, SIGCONT); sigaddset(signal, sig); } } complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) { bool ret = false; switch (siginfo_layout(info->si_signo, info->si_code)) { case SIL_KILL: case SIL_CHLD: case SIL_RT: ret = true; break; case SIL_TIMER: case SIL_POLL: case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: case SIL_SYS: ret = false; break; } return ret; } int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; if (info == SEND_SIG_NOINFO) { /* Force if sent from an ancestor pid namespace */ force = !task_pid_nr_ns(current, task_active_pid_ns(t)); } else if (info == SEND_SIG_PRIV) { /* Don't ignore kernel generated signals */ force = true; } else if (has_si_pid_and_uid(info)) { /* SIGKILL and SIGSTOP is special or has ids */ struct user_namespace *t_user_ns; rcu_read_lock(); t_user_ns = task_cred_xxx(t, user_ns); if (current_user_ns() != t_user_ns) { kuid_t uid = make_kuid(current_user_ns(), info->si_uid); info->si_uid = from_kuid_munged(t_user_ns, uid); } rcu_read_unlock(); /* A kernel generated signal? */ force = (info->si_code == SI_KERNEL); /* From an ancestor pid namespace? */ if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { info->si_pid = 0; force = true; } } return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) { struct pt_regs *regs = task_pt_regs(current); struct file *exe_file; exe_file = get_task_exe_file(current); if (exe_file) { pr_info("%pD: %s: potentially unexpected fatal signal %d.\n", exe_file, current->comm, signr); fput(exe_file); } else { pr_info("%s: potentially unexpected fatal signal %d.\n", current->comm, signr); } #if defined(__i386__) && !defined(__arch_um__) pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { unsigned char insn; if (get_user(insn, (unsigned char *)(regs->ip + i))) break; pr_cont("%02x ", insn); } } pr_cont("\n"); #endif preempt_disable(); show_regs(regs); preempt_enable(); } static int __init setup_print_fatal_signals(char *str) { get_option (&str, &print_fatal_signals); return 1; } __setup("print-fatal-signals=", setup_print_fatal_signals); int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } return ret; } enum sig_handler { HANDLER_CURRENT, /* If reachable use the current handler */ HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ HANDLER_EXIT, /* Only visible as the process exit code */ }; /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. * * Note: If we unblock the signal, we always reset it to SIG_DFL, * since we do not want to have a signal handler that was blocked * be invoked when user space had explicitly blocked it. * * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ static int force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) sigdelset(&t->blocked, sig); } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); /* This can happen if the signal was already pending and blocked */ if (!task_sigpending(t)) signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; } int force_sig_info(struct kernel_siginfo *info) { return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* * Nuke all other threads in the group. */ int zap_other_threads(struct task_struct *p) { struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ if (t->exit_state) continue; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return count; } struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; rcu_read_lock(); for (;;) { sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) break; /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * * We need to ensure that tsk->sighand is still the same * after we take the lock, we can race with de_thread() or * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } rcu_read_unlock(); return sighand; } #ifdef CONFIG_LOCKDEP void lockdep_assert_task_sighand_held(struct task_struct *task) { struct sighand_struct *sighand; rcu_read_lock(); sighand = rcu_dereference(task->sighand); if (sighand) lockdep_assert_held(&sighand->siglock); else WARN_ON_ONCE(1); rcu_read_unlock(); } #endif /* * send signal info to all the members of a thread group or to the * individual thread if type == PIDTYPE_PID. */ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { int ret; rcu_read_lock(); ret = check_kill_permission(sig, info, p); rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, type); return ret; } /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) * - the caller must hold at least a readlock on tasklist_lock */ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int ret = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); /* * If group_send_sig_info() succeeds at least once ret * becomes 0 and after that the code below has no effect. * Otherwise we return the last err or -ESRCH if this * process group is empty. */ if (ret) ret = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return ret; } static int kill_pid_info_type(int sig, struct kernel_siginfo *info, struct pid *pid, enum pid_type type) { int error = -ESRCH; struct task_struct *p; for (;;) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) error = group_send_sig_info(sig, info, p, type); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with * de_thread() it will find the new leader. */ } } int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) { return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID); } static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); error = kill_pid_info(sig, info, find_vpid(pid)); rcu_read_unlock(); return error; } static inline bool kill_as_cred_perm(const struct cred *cred, struct task_struct *target) { const struct cred *pcred = __task_cred(target); return uid_eq(cred->euid, pcred->suid) || uid_eq(cred->euid, pcred->uid) || uid_eq(cred->uid, pcred->suid) || uid_eq(cred->uid, pcred->uid); } /* * The usb asyncio usage of siginfo is wrong. The glibc support * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. * AKA after the generic fields: * kernel_pid_t si_pid; * kernel_uid32_t si_uid; * sigval_t si_value; * * Unfortunately when usb generates SI_ASYNCIO it assumes the layout * after the generic fields is: * void __user *si_addr; * * This is a practical problem when there is a 64bit big endian kernel * and a 32bit userspace. As the 32bit address will encoded in the low * 32bits of the pointer. Those low 32bits will be stored at higher * address than appear in a 32 bit pointer. So userspace will not * see the address it was expecting for it's completions. * * There is nothing in the encoding that can allow * copy_siginfo_to_user32 to detect this confusion of formats, so * handle this by requiring the caller of kill_pid_usb_asyncio to * notice when this situration takes place and to store the 32bit * pointer in sival_int, instead of sival_addr of the sigval_t addr * parameter. */ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *pid, const struct cred *cred) { struct kernel_siginfo info; struct task_struct *p; unsigned long flags; int ret = -EINVAL; if (!valid_signal(sig)) return ret; clear_siginfo(&info); info.si_signo = sig; info.si_errno = errno; info.si_code = SI_ASYNCIO; *((sigval_t *)&info.si_pid) = addr; rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; } if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; } out_unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). * * POSIX specifies that kill(-1,sig) is unspecified, but what we have * is probably wrong. Should make it like BSD or SYSV. */ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; if (pid > 0) return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) return -ESRCH; read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, pid ? find_vpid(-pid) : task_pgrp(current)); } else { int retval = 0, count = 0; struct task_struct * p; for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p, PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; } } ret = count ? retval : -ESRCH; } read_unlock(&tasklist_lock); return ret; } /* * These are for backward compatibility with the rest of the kernel source. */ int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). */ if (!valid_signal(sig)) return -EINVAL; return do_send_sig_info(sig, info, p, PIDTYPE_PID); } EXPORT_SYMBOL(send_sig_info); #define __si_special(priv) \ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) int send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, __si_special(priv), p); } EXPORT_SYMBOL(send_sig); void force_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } EXPORT_SYMBOL(force_sig); void force_fatal_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } void force_exit_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_EXIT); } /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ void force_sigsegv(int sig) { if (sig == SIGSEGV) force_fatal_sig(SIGSEGV); else force_sig(SIGSEGV); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr) { return force_sig_fault_to_task(sig, code, addr, current); } int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return send_sig_info(info.si_signo, &info, t); } int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_BNDERR; info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; return force_sig_info(&info); } #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; return force_sig_info(&info); } #endif int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_PERF; info.si_addr = addr; info.si_perf_data = sig_data; info.si_perf_type = type; /* * Signals generated by perf events should not terminate the whole * process if SIGTRAP is blocked, however, delivering the signal * asynchronously is better than not delivering at all. But tell user * space if the signal was asynchronous, so it can clearly be * distinguished from normal synchronous ones. */ info.si_perf_flags = sigismember(¤t->blocked, info.si_signo) ? TRAP_PERF_FLAG_ASYNC : 0; return send_sig_info(info.si_signo, &info, current); } /** * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ int force_sig_seccomp(int syscall, int reason, bool force_coredump) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSYS; info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; return force_sig_info_to_task(&info, current, force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ int force_sig_ptrace_errno_trap(int errno, void __user *addr) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return send_sig_info(info.si_signo, &info, t); } static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; read_lock(&tasklist_lock); ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); return ret; } int kill_pgrp(struct pid *pid, int sig, int priv) { return kill_pgrp_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) { return kill_pid_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pid); #ifdef CONFIG_POSIX_TIMERS /* * These functions handle POSIX timer signals. POSIX timers use * preallocated sigqueue structs for sending signals. */ static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; struct sigqueue *q, *n; signal = pending->signal; sigemptyset(&retain); list_for_each_entry_safe(q, n, &pending->list, list) { int sig = q->info.si_signo; if (likely(q->info.si_code != SI_TIMER)) { sigaddset(&retain, sig); } else { sigdelset(&signal, sig); list_del_init(&q->list); __sigqueue_free(q); } } sigorsets(&pending->signal, &signal, &retain); } void flush_itimer_signals(void) { struct task_struct *tsk = current; guard(spinlock_irqsave)(&tsk->sighand->siglock); __flush_itimer_signals(&tsk->pending); __flush_itimer_signals(&tsk->signal->shared_pending); } bool posixtimer_init_sigqueue(struct sigqueue *q) { struct ucounts *ucounts = sig_get_ucounts(current, -1, 0); if (!ucounts) return false; clear_siginfo(&q->info); __sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC); return true; } static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type) { struct sigpending *pending; int sig = q->info.si_signo; signalfd_notify(t, sig); pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, type); } /* * This function is used by POSIX timers to deliver a timer signal. * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID * set), the signal must be delivered to the specific thread (queues * into t->pending). * * Where type is not PIDTYPE_PID, signals must be delivered to the * process. In this case, prefer to deliver to current if it is in * the same thread group as the target process and its sighand is * stable, which avoids unnecessarily waking up a potentially idle task. */ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) { struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type); if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current) && !current->exit_state) t = current; return t; } void posixtimer_send_sigqueue(struct k_itimer *tmr) { struct sigqueue *q = &tmr->sigq; int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; int result; guard(rcu)(); t = posixtimer_get_target(tmr); if (!t) return; if (!likely(lock_task_sighand(t, &flags))) return; /* * Update @tmr::sigqueue_seq for posix timer signals with sighand * locked to prevent a race against dequeue_signal(). */ tmr->it_sigqueue_seq = tmr->it_signal_seq; /* * Set the signal delivery status under sighand lock, so that the * ignored signal handling can distinguish between a periodic and a * non-periodic timer. */ tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING; if (!prepare_signal(sig, t, false)) { result = TRACE_SIGNAL_IGNORED; if (!list_empty(&q->list)) { /* * The signal was ignored and blocked. The timer * expiry queued it because blocked signals are * queued independent of the ignored state. * * The unblocking set SIGPENDING, but the signal * was not yet dequeued from the pending list. * So prepare_signal() sees unblocked and ignored, * which ends up here. Leave it queued like a * regular signal. * * The same happens when the task group is exiting * and the signal is already queued. * prepare_signal() treats SIGNAL_GROUP_EXIT as * ignored independent of its queued state. This * gets cleaned up in __exit_signal(). */ goto out; } /* Periodic timers with SIG_IGN are queued on the ignored list */ if (tmr->it_sig_periodic) { /* * Already queued means the timer was rearmed after * the previous expiry got it on the ignore list. * Nothing to do for that case. */ if (hlist_unhashed(&tmr->ignored_list)) { /* * Take a signal reference and queue it on * the ignored list. */ posixtimer_sigqueue_getref(q); posixtimer_sig_ignore(t, q); } } else if (!hlist_unhashed(&tmr->ignored_list)) { /* * Covers the case where a timer was periodic and * then the signal was ignored. Later it was rearmed * as oneshot timer. The previous signal is invalid * now, and this oneshot signal has to be dropped. * Remove it from the ignored list and drop the * reference count as the signal is not longer * queued. */ hlist_del_init(&tmr->ignored_list); posixtimer_putref(tmr); } goto out; } if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } /* * If the signal is on the ignore list, it got blocked after it was * ignored earlier. But nothing lifted the ignore. Move it back to * the pending list to be consistent with the regular signal * handling. This already holds a reference count. * * If it's not on the ignore list acquire a reference count. */ if (likely(hlist_unhashed(&tmr->ignored_list))) posixtimer_sigqueue_getref(q); else hlist_del_init(&tmr->ignored_list); posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); /* * If the timer is marked deleted already or the signal originates * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. */ if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); } static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { struct hlist_head *head = &tsk->signal->ignored_posix_timers; struct hlist_node *tmp; struct k_itimer *tmr; if (likely(hlist_empty(head))) return; /* * Rearming a timer with sighand lock held is not possible due to * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and * let the signal delivery path deal with it whether it needs to be * rearmed or not. This cannot be decided here w/o dropping sighand * lock and creating a loop retry horror show. */ hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) { struct task_struct *target; /* * tmr::sigq.info.si_signo is immutable, so accessing it * without holding tmr::it_lock is safe. */ if (tmr->sigq.info.si_signo != sig) continue; hlist_del_init(&tmr->ignored_list); /* This should never happen and leaks a reference count */ if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list))) continue; /* * Get the target for the signal. If target is a thread and * has exited by now, drop the reference count. */ guard(rcu)(); target = posixtimer_get_target(tmr); if (target) posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type); else posixtimer_putref(tmr); } } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { } static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } #endif /* !CONFIG_POSIX_TIMERS */ void do_notify_pidfd(struct task_struct *task) { struct pid *pid = task_pid(task); WARN_ON(task->exit_state == 0); __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, poll_to_key(EPOLLIN | EPOLLRDNORM)); } /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * * Returns true if our parent ignored us and so we've switched to * self-reaping. */ bool do_notify_parent(struct task_struct *tsk, int sig) { struct kernel_siginfo info; unsigned long flags; struct sighand_struct *psig; bool autoreap = false; u64 utime, stime; WARN_ON_ONCE(sig == -1); /* do_notify_parent_cldstop should have been called instead. */ WARN_ON_ONCE(task_is_stopped_or_traced(tsk)); WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* ptraced, or group-leader without sub-threads */ do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. * Check if it has changed security domain. */ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id)) sig = SIGCHLD; } clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* * We are under tasklist_lock here so our parent is tied to * us and cannot change. * * task_active_pid_ns will always return the same pid namespace * until a task passes through release_task. * * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) info.si_code = CLD_DUMPED; else if (tsk->exit_code & 0x7f) info.si_code = CLD_KILLED; else { info.si_code = CLD_EXITED; info.si_status = tsk->exit_code >> 8; } psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* * We are exiting and our parent doesn't care. POSIX.1 * defines special semantics for setting SIGCHLD to SIG_IGN * or setting the SA_NOCLDWAIT flag: we should be reaped * automatically and not left for our parent's wait4 call. * Rather than having the parent do it as a magic kind of * signal handler, we just set this to tell do_exit that we * can be cleaned up without becoming a zombie. Note that * we still call __wake_up_parent in this case, because a * blocked sys_wait4 might now return -ECHILD. * * Whether we send SIGCHLD or not for SA_NOCLDWAIT * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ autoreap = true; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. */ if (valid_signal(sig) && sig) __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); return autoreap; } /** * do_notify_parent_cldstop - notify parent of stopped/continued state change * @tsk: task reporting the state change * @for_ptracer: the notification is for ptracer * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report * * Notify @tsk's parent that the stopped/continued state has changed. If * @for_ptracer is %false, @tsk's group leader notifies to its real parent. * If %true, @tsk reports to @tsk->parent which should be the ptracer. * * CONTEXT: * Must be called with tasklist_lock at least read locked. */ static void do_notify_parent_cldstop(struct task_struct *tsk, bool for_ptracer, int why) { struct kernel_siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; u64 utime, stime; if (for_ptracer) { parent = tsk->parent; } else { tsk = tsk->group_leader; parent = tsk->real_parent; } clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime); info.si_stime = nsec_to_clock_t(stime); info.si_code = why; switch (why) { case CLD_CONTINUED: info.si_status = SIGCONT; break; case CLD_STOPPED: info.si_status = tsk->signal->group_exit_code & 0x7f; break; case CLD_TRAPPED: info.si_status = tsk->exit_code & 0x7f; break; default: BUG(); } sighand = parent->sighand; spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ __wake_up_parent(tsk, parent); spin_unlock_irqrestore(&sighand->siglock, flags); } /* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. * We always set current->last_siginfo while stopped here. * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * * Returns the signal the ptracer requested the code resume * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ static int ptrace_stop(int exit_code, int why, unsigned long message, kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { bool gstop_done = false; if (arch_ptrace_stop_needed()) { /* * The arch code has something special to do before a * ptrace stop. This is allowed to block, e.g. for faults * on user stack pages. We can't keep the siglock while * calling arch_ptrace_stop, so we must release it now. * To preserve proper semantics, we must do this before * any signal bookkeeping like checking group_stop_count. */ spin_unlock_irq(¤t->sighand->siglock); arch_ptrace_stop(); spin_lock_irq(¤t->sighand->siglock); } /* * After this point ptrace_signal_wake_up or signal_wake_up * will clear TASK_TRACED if ptrace_unlink happens or a fatal * signal comes in. Handle previous ptrace_unlinks and fatal * signals here to prevent ptrace_stop sleeping in schedule. */ if (!current->ptrace || __fatal_signal_pending(current)) return exit_code; set_special_state(TASK_TRACED); current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it. * * TRACER TRACEE * * ptrace_attach() * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) * do_wait() * set_current_state() smp_wmb(); * ptrace_do_wait() * wait_task_stopped() * task_stopped_code() * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ smp_wmb(); current->ptrace_message = message; current->last_siginfo = info; current->exit_code = exit_code; /* * If @why is CLD_STOPPED, we're trapping to participate in a group * stop. Do the bookkeeping. Note that if SIGCONT was delievered * across siglock relocks since INTERRUPT was scheduled, PENDING * could be clear now. We act as if SIGCONT is received after * TASK_TRACED is entered - ignore it. */ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); /* * Notify parents of the stop. * * While ptraced, there are two parents - the ptracer and * the real_parent of the group_leader. The ptracer should * know about every stop while the real parent is only * interested in the completion of group stop. The states * for the two don't interact with each other. Notify * separately unless they're gonna be duplicates. */ if (current->ptrace) do_notify_parent_cldstop(current, true, why); if (gstop_done && (!current->ptrace || ptrace_reparented(current))) do_notify_parent_cldstop(current, false, why); /* * The previous do_notify_parent_cldstop() invocation woke ptracer. * One a PREEMPTION kernel this can result in preemption requirement * which will be fulfilled after read_unlock() and the ptracer will be * put on the CPU. * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for * this task wait in schedule(). If this task gets preempted then it * remains enqueued on the runqueue. The ptracer will observe this and * then sleep for a delay of one HZ tick. In the meantime this task * gets scheduled, enters schedule() and will wait for the ptracer. * * This preemption point is not bad from a correctness point of * view but extends the runtime by one HZ tick time due to the * ptracer's sleep. The preempt-disable section ensures that there * will be no preemption between unlock and schedule() and so * improving the performance since the ptracer will observe that * the tracee is scheduled out once it gets on the CPU. * * On PREEMPT_RT locking tasklist_lock does not disable preemption. * Therefore the task can be preempted after do_notify_parent_cldstop() * before unlocking tasklist_lock so there is no benefit in doing this. * * In fact disabling preemption is harmful on PREEMPT_RT because * the spinlock_t in cgroup_enter_frozen() must not be acquired * with preemption disabled due to the 'sleeping' spinlock * substitution of RT. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(¤t->sighand->siglock); exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. * This sets TIF_SIGPENDING, but never clears it. */ recalc_sigpending_tsk(current); return exit_code; } static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) { int signr; BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); if (unlikely(task_work_pending(current))) task_work_run(); spin_lock_irq(¤t->sighand->siglock); signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message); spin_unlock_irq(¤t->sighand->siglock); return signr; } /** * do_signal_stop - handle group stop for SIGSTOP and other stop signals * @signr: signr causing group stop if initiating * * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr * and participate in it. If already set, participate in the existing * group stop. If participated in a group stop (and thus slept), %true is * returned with siglock released. * * If ptraced, this function doesn't handle stop itself. Instead, * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock * untouched. The caller must ensure that INTERRUPT trap handling takes * places afterwards. * * CONTEXT: * Must be called with @current->sighand->siglock held, which is released * on %true return. * * RETURNS: * %false if group stop is already cancelled or ptrace trap is scheduled. * %true if participated in group stop. */ static bool do_signal_stop(int signr) __releases(¤t->sighand->siglock) { struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(sig->flags & SIGNAL_GROUP_EXIT) || unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must * initiate one now. * * While ptraced, a task may be resumed while group stop is * still in effect and then receive a stop signal and * initiate another group stop. This deviates from the * usual behavior as two consecutive stop signals can't * cause two group stops when !ptraced. That is why we * also check !task_is_stopped(t) below. * * The condition can be distinguished by testing whether * SIGNAL_STOP_STOPPED is already set. Don't generate * group_exit_code in such case. * * This is not necessary for SIGNAL_STOP_CONTINUED because * an intervening stop signal is required to cause two * continued events regardless of ptrace. */ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; sig->group_stop_count = 0; if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ if (!task_is_stopped(t) && task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; if (likely(!(t->ptrace & PT_SEIZED))) signal_wake_up(t, 0); else ptrace_trap_notify(t); } } } if (likely(!current->ptrace)) { int notify = 0; /* * If there are no other threads in the group, or if there * is a group stop in progress and we are the last to stop, * report to the parent. */ if (task_participate_group_stop(current)) notify = CLD_STOPPED; current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); /* * Notify the parent of the group stop completion. Because * we're not holding either the siglock or tasklist_lock * here, ptracer may attach inbetween; however, this is for * group stop and should always be delivered to the real * parent of the group leader. The new ptracer will get * its notification when this task transitions into * TASK_TRACED. */ if (notify) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, notify); read_unlock(&tasklist_lock); } /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); schedule(); return true; } else { /* * While ptraced, group stop is handled by STOP trap. * Schedule it and let the caller deal with it. */ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); return false; } } /** * do_jobctl_trap - take care of ptrace jobctl traps * * When PT_SEIZED, it's used for both group stop and explicit * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with * accompanying siginfo. If stopped, lower eight bits of exit_code contain * the stop signal; otherwise, %SIGTRAP. * * When !PT_SEIZED, it's used only for group stop trap with stop signal * number as exit_code and no siginfo. * * CONTEXT: * Must be called with @current->sighand->siglock held, which may be * released and re-acquired before returning with intervening sleep. */ static void do_jobctl_trap(void) { struct signal_struct *signal = current->signal; int signr = current->jobctl & JOBCTL_STOP_SIGMASK; if (current->ptrace & PT_SEIZED) { if (!signal->group_stop_count && !(signal->flags & SIGNAL_STOP_STOPPED)) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } /** * do_freezer_trap - handle the freezer jobctl trap * * Puts the task into frozen state, if only the task is not about to quit. * In this case it drops JOBCTL_TRAP_FREEZE. * * CONTEXT: * Must be called with @current->sighand->siglock held, * which is always released before returning. */ static void do_freezer_trap(void) __releases(¤t->sighand->siglock) { /* * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, * let's make another loop to give it a chance to be handled. * In any case, we'll return back. */ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != JOBCTL_TRAP_FREEZE) { spin_unlock_irq(¤t->sighand->siglock); return; } /* * Now we're sure that there is no pending fatal signal and no * pending traps. Clear TIF_SIGPENDING to not get out of schedule() * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); schedule(); /* * We could've been woken by task_work, run it to clear * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. */ clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will * change signr. This flag has no meaning unless we are going * to stop after return from ptrace_stop(). In this case it will * be checked in do_signal_stop(), we should only stop if it was * not cleared by SIGCONT while we were sleeping. See also the * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) return signr; /* * Update the siginfo structure if the signal has * changed. If the debugger wanted something * specific in the siginfo structure then it should * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); info->si_uid = from_kuid_munged(current_user_ns(), task_uid(current->parent)); rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr) || fatal_signal_pending(current)) { send_signal_locked(signr, info, current, type); signr = 0; } return signr; } static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; case SIL_KILL: case SIL_TIMER: case SIL_POLL: case SIL_CHLD: case SIL_RT: case SIL_SYS: break; } } bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; int signr; clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); if (!task_sigpending(current)) return false; if (unlikely(uprobe_deny_signal())) return false; /* * Do this once, we can't return to user-mode if freezing() == T. * do_signal_stop() and ptrace_stop() do freezable_schedule() and * thus do not need another check after return. */ try_to_freeze(); relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { int why; if (signal->flags & SIGNAL_CLD_CONTINUED) why = CLD_CONTINUED; else why = CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); /* * Notify the parent that we're continuing. This event is * always per-process and doesn't make whole lot of sense * for ptracers, who shouldn't consume the state via * wait(2) either, but, for backward compatibility, notify * the ptracer of the group leader too unless it's gonna be * a duplicate. */ read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, why); if (ptrace_reparented(current->group_leader)) do_notify_parent_cldstop(current->group_leader, true, why); read_unlock(&tasklist_lock); goto relock; } for (;;) { struct k_sigaction *ka; enum pid_type type; /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL-1]); recalc_sigpending(); /* * implies do_group_exit() or return to PF_USER_WORKER, * no need to initialize ksig->info/etc. */ goto fatal; } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; if (unlikely(current->jobctl & (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { if (current->jobctl & JOBCTL_TRAP_MASK) { do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); } else if (current->jobctl & JOBCTL_TRAP_FREEZE) do_freezer_trap(); goto relock; } /* * If the task is leaving the frozen state, let's update * cgroup counters and reset the frozen bit. */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); cgroup_leave_frozen(false); goto relock; } /* * Signals generated by the execution of an instruction * need to be delivered before any other pending signals * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) signr = dequeue_signal(¤t->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; break; /* will return non-zero "signr" value */ } /* * Now we are doing the default action for this signal. */ if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; /* * Global init gets no signals it doesn't want. * Container-init gets no signals it doesn't want from same * container. * * Note that if global/container-init sees a sig_kernel_only() * signal here, the signal must have been generated internally * or must have come from an ancestor namespace. In either * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { /* * The default action is to stop all threads in * the thread group. The job control signals * do nothing in an orphaned pgrp, but SIGSTOP * always works. Note that siglock needs to be * dropped during the call to is_orphaned_pgrp() * because of lock ordering with tasklist_lock. * This allows an intervening SIGCONT to be posted. * We need to check for that and bail out if necessary. */ if (signr != SIGSTOP) { spin_unlock_irq(&sighand->siglock); /* signals can be posted during this window */ if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(&sighand->siglock); } if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } /* * We didn't actually stop, due to a race * with SIGCONT or something like that. */ continue; } fatal: spin_unlock_irq(&sighand->siglock); if (unlikely(cgroup_task_frozen(current))) cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with * their demise. If we lost the race with another * thread getting here, it set group_exit_code * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ do_coredump(&ksig->info); } /* * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so we * cannot call do_exit() on their behalf. Note that ksig won't * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; /* * Death signals, no core dump. */ do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); ksig->sig = signr; if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); out: return signr > 0; } /** * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; /* A signal was successfully delivered, and the saved sigmask was stored on the signal frame, and will be restored by sigreturn. So we can simply clear the restore sigmask flag. */ clear_restore_sigmask(); sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); if (stepping) ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take * the shared signals in @which since we will not. */ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { sigset_t retarget; struct task_struct *t; sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); if (sigisemptyset(&retarget)) return; for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; if (!has_pending_signals(&retarget, &t->blocked)) continue; /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) break; } } void exit_signals(struct task_struct *tsk) { int group_stop = 0; sigset_t unblocked; /* * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. */ cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; } spin_lock_irq(&tsk->sighand->siglock); /* * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); /* * If group stop has completed, deliver the notification. This * should always go to the real parent of the group leader. */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } /* * System call entry points. */ /** * sys_restart_syscall - restart a system call */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = ¤t->restart_block; return restart->fn(restart); } long do_no_restart_syscall(struct restart_block *param) { return -EINTR; } static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ sigandnsets(&newblocked, newset, ¤t->blocked); retarget_shared_pending(tsk, &newblocked); } tsk->blocked = *newset; recalc_sigpending(); } /** * set_current_blocked - change current->blocked mask * @newset: new mask * * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. */ void set_current_blocked(sigset_t *newset) { sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; /* * In case the signal mask hasn't changed, there is nothing we need * to do. The current->blocked shouldn't be modified by other task. */ if (sigequalsets(&tsk->blocked, newset)) return; spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); } /* * This is also useful for kernel threads that want to temporarily * (or permanently) block certain signals. * * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel * interface happily blocks "unblockable" signals like SIGKILL * and friends. */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { struct task_struct *tsk = current; sigset_t newset; /* Lockless, only current can change ->blocked, never from irq */ if (oldset) *oldset = tsk->blocked; switch (how) { case SIG_BLOCK: sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: newset = *set; break; default: return -EINVAL; } __set_current_blocked(&newset); return 0; } EXPORT_SYMBOL(sigprocmask); /* * The api helps set app-provided sigmasks. * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. * * Note that it does set_restore_sigmask() in advance, so it must be always * paired with restore_saved_sigmask_unless() before return from syscall. */ int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { sigset_t kmask; if (!umask) return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; set_restore_sigmask(); current->saved_sigmask = current->blocked; set_current_blocked(&kmask); return 0; } #ifdef CONFIG_COMPAT int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { sigset_t kmask; if (!umask) return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (get_compat_sigset(&kmask, umask)) return -EFAULT; set_restore_sigmask(); current->saved_sigmask = current->blocked; set_current_blocked(&kmask); return 0; } #endif /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals * @nset: stores pending signals * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, sigset_t __user *, oset, size_t, sigsetsize) { sigset_t old_set, new_set; int error; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; old_set = current->blocked; if (nset) { if (copy_from_user(&new_set, nset, sizeof(sigset_t))) return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } if (oset) { if (copy_to_user(oset, &old_set, sizeof(sigset_t))) return -EFAULT; } return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (nset) { sigset_t new_set; int error; if (get_compat_sigset(&new_set, nset)) return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; } #endif static void do_sigpending(sigset_t *set) { spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, ¤t->signal->shared_pending.signal); spin_unlock_irq(¤t->sighand->siglock); /* Outside the lock because only this thread touches it. */ sigandsets(set, ¤t->blocked, set); } /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sigsetsize)) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); return put_compat_sigset(uset, &set, sigsetsize); } #endif static const struct { unsigned char limit, layout; } sig_sicodes[] = { [SIGILL] = { NSIGILL, SIL_FAULT }, [SIGFPE] = { NSIGFPE, SIL_FAULT }, [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, [SIGBUS] = { NSIGBUS, SIL_FAULT }, [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, #if defined(SIGEMT) [SIGEMT] = { NSIGEMT, SIL_FAULT }, #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; static bool known_siginfo_layout(unsigned sig, int si_code) { if (si_code == SI_KERNEL) return true; else if ((si_code > SI_USER)) { if (sig_specific_sicodes(sig)) { if (si_code <= sig_sicodes[sig].limit) return true; } else if (si_code <= NSIGPOLL) return true; } else if (si_code >= SI_DETHREAD) return true; else if (si_code == SI_ASYNCNL) return true; return false; } enum siginfo_layout siginfo_layout(unsigned sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { if ((sig < ARRAY_SIZE(sig_sicodes)) && (si_code <= sig_sicodes[sig].limit)) { layout = sig_sicodes[sig].layout; /* Handle the exceptions */ if ((sig == SIGBUS) && (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) layout = SIL_FAULT_MCEERR; else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) layout = SIL_FAULT_BNDERR; #ifdef SEGV_PKUERR else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; else if (IS_ENABLED(CONFIG_ALPHA) && ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { if (si_code == SI_TIMER) layout = SIL_TIMER; else if (si_code == SI_SIGIO) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; } return layout; } static inline char __user *si_expansion(const siginfo_t __user *info) { return ((char __user *)info) + sizeof(struct kernel_siginfo); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { char __user *expansion = si_expansion(to); if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; if (clear_user(expansion, SI_EXPANSION_SIZE)) return -EFAULT; return 0; } static int post_copy_siginfo_from_user(kernel_siginfo_t *info, const siginfo_t __user *from) { if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) { char __user *expansion = si_expansion(from); char buf[SI_EXPANSION_SIZE]; int i; /* * An unknown si_code might need more than * sizeof(struct kernel_siginfo) bytes. Verify all of the * extra bytes are 0. This guarantees copy_siginfo_to_user * will return this data to userspace exactly. */ if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE)) return -EFAULT; for (i = 0; i < SI_EXPANSION_SIZE; i++) { if (buf[i] != 0) return -E2BIG; } } return 0; } static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; to->si_signo = signo; return post_copy_siginfo_from_user(to, from); } int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; return post_copy_siginfo_from_user(to, from); } #ifdef CONFIG_COMPAT /** * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo * @to: compat siginfo destination * @from: kernel siginfo source * * Note: This function does not work properly for the SIGCHLD on x32, but * fortunately it doesn't have to. The only valid callers for this function are * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. * The latter does not care because SIGCHLD will never cause a coredump. */ void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from) { memset(to, 0, sizeof(*to)); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = ptr_to_compat(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = ptr_to_compat(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_lower = ptr_to_compat(from->si_lower); to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; to->si_utime = from->si_utime; to->si_stime = from->si_stime; break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = ptr_to_compat(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } } int __copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo *from) { clear_siginfo(to); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = compat_ptr(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = compat_ptr(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = compat_ptr(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = compat_ptr(from->si_addr); to->si_lower = compat_ptr(from->si_lower); to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = compat_ptr(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; #ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { to->si_utime = from->_sifields._sigchld_x32._utime; to->si_stime = from->_sifields._sigchld_x32._stime; } else #endif { to->si_utime = from->si_utime; to->si_stime = from->si_stime; } break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = compat_ptr(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } return 0; } static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; from.si_signo = signo; return post_copy_siginfo_from_user32(to, &from); } int copy_siginfo_from_user32(struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; return post_copy_siginfo_from_user32(to, &from); } #endif /* CONFIG_COMPAT */ /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; enum pid_type type; int sig, ret = 0; if (ts) { if (!timespec64_valid(ts)) return -EINVAL; timeout = timespec64_to_ktime(*ts); to = &timeout; } /* * Invert the set of allowed signals to get those we want to block. */ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(&mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when * they arrive. Unblocking is always fine, we can avoid * set_current_blocked(). */ tsk->real_blocked = tsk->blocked; sigandsets(&tsk->blocked, &tsk->blocked, &mask); recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); sig = dequeue_signal(&mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); if (sig) return sig; return ret ? -EINTR : -EAGAIN; } /** * sys_rt_sigtimedwait - synchronously wait for queued signals specified * in @uthese * @uthese: queued signals to wait for * @uinfo: if non-null, the signal's siginfo is returned here * @uts: upper bound on process time suspension * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_timespec64(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct old_timespec32 __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_old_timespec32(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_timespec64(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_old_timespec32(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #endif static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info, enum pid_type type) { clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } /** * sys_kill - send a signal to a process * @pid: the PID of the process * @sig: signal to be sent */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid * namespace. */ static bool access_pidfd_pidns(struct pid *pid) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *p = ns_of_pid(pid); for (;;) { if (!p) return false; if (p == active) break; p = p->parent; } return true; } static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* * Avoid hooking up compat syscalls and instead handle necessary * conversions here. Note, this is a stop-gap measure and should not be * considered a generic solution. */ if (in_compat_syscall()) return copy_siginfo_from_user32( kinfo, (struct compat_siginfo __user *)info); #endif return copy_siginfo_from_user(kinfo, info); } static struct pid *pidfd_to_pid(const struct file *file) { struct pid *pid; pid = pidfd_pid(file); if (!IS_ERR(pid)) return pid; return tgid_pidfd_to_pid(file); } #define PIDFD_SEND_SIGNAL_FLAGS \ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, siginfo_t __user *info, unsigned int flags) { kernel_siginfo_t kinfo; switch (flags) { case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; break; case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; break; case PIDFD_SIGNAL_PROCESS_GROUP: type = PIDTYPE_PGID; break; } if (info) { int ret; ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) return ret; if (unlikely(sig != kinfo.si_signo)) return -EINVAL; /* Only allow sending arbitrary signals to yourself. */ if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) return -EPERM; } else { prepare_kill_siginfo(sig, &kinfo, type); } if (type == PIDTYPE_PGID) return kill_pgrp_info(sig, &kinfo, pid); return kill_pid_info_type(sig, &kinfo, pid, type); } /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process * @sig: signal to send * @info: signal info * @flags: future flags * * Send the signal to the thread group or to the individual thread depending * on PIDFD_THREAD. * In the future extension to @flags may be used to override the default scope * of @pidfd. * * Return: 0 on success, negative errno on failure */ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { struct pid *pid; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) return -EINVAL; /* Ensure that only a single signal scope determining flag is set. */ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; switch (pidfd) { case PIDFD_SELF_THREAD: pid = get_task_pid(current, PIDTYPE_PID); type = PIDTYPE_PID; break; case PIDFD_SELF_THREAD_GROUP: pid = get_task_pid(current, PIDTYPE_TGID); type = PIDTYPE_TGID; break; default: { CLASS(fd, f)(pidfd); if (fd_empty(f)) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(fd_file(f)); if (IS_ERR(pid)) return PTR_ERR(pid); if (!access_pidfd_pidns(pid)) return -EINVAL; /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; return do_pidfd_send_signal(pid, sig, type, info, flags); } } return do_pidfd_send_signal(pid, sig, type, info, flags); } static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { struct task_struct *p; int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. */ if (!error && sig) { error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, * and the signal is private anyway. */ if (unlikely(error == -ESRCH)) error = 0; } } rcu_read_unlock(); return error; } static int do_tkill(pid_t tgid, pid_t pid, int sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_PID); return do_send_specific(tgid, pid, sig, &info); } /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread * @pid: the PID of the thread * @sig: signal to be sent * * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; return do_tkill(tgid, pid, sig); } /** * sys_tkill - send signal to one specific task * @pid: the PID of the task * @sig: signal to be sent * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; return do_tkill(0, pid, sig); } static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info) { /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; /* POSIX.1b doesn't mention process groups. */ return kill_proc_info(sig, info, pid); } /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread * @sig: signal to be sent * @uinfo: signal info to be sent */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #endif static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; return do_send_specific(tgid, pid, sig, info); } SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, compat_pid_t, tgid, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #endif /* * For kthreads only, must not be used if cloned with CLONE_SIGHAND */ void kernel_sigaction(int sig, __sighandler_t action) { spin_lock_irq(¤t->sighand->siglock); current->sighand->action[sig - 1].sa.sa_handler = action; if (action == SIG_IGN) { sigset_t mask; sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(current, &mask, ¤t->signal->shared_pending); flush_sigqueue_mask(current, &mask, ¤t->pending); recalc_sigpending(); } spin_unlock_irq(¤t->sighand->siglock); } EXPORT_SYMBOL(kernel_sigaction); void __weak sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { } int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; k = &p->sighand->action[sig-1]; spin_lock_irq(&p->sighand->siglock); if (k->sa.sa_flags & SA_IMMUTABLE) { spin_unlock_irq(&p->sighand->siglock); return -EINVAL; } if (oact) *oact = *k; /* * Make sure that we never accidentally claim to support SA_UNSUPPORTED, * e.g. by having an architecture use the bit in their uapi. */ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED); /* * Clear unknown flag bits in order to allow userspace to detect missing * support for flag bits and to allow the kernel to use non-uapi bits * internally. */ if (act) act->sa.sa_flags &= UAPI_SA_FLAGS; if (oact) oact->sa.sa_flags &= UAPI_SA_FLAGS; sigaction_compat_abi(act, oact); if (act) { bool was_ignored = k->sa.sa_handler == SIG_IGN; sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is * pending shall cause the pending signal to be discarded, * whether or not it is blocked." * * "Setting a signal action to SIG_DFL for a signal that is * pending and whose default action is to ignore the signal * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &mask, &t->pending); } else if (was_ignored) { posixtimer_sig_unignore(p, sig); } } spin_unlock_irq(&p->sighand->siglock); return 0; } #ifdef CONFIG_DYNAMIC_SIGFRAME static inline void sigaltstack_lock(void) __acquires(¤t->sighand->siglock) { spin_lock_irq(¤t->sighand->siglock); } static inline void sigaltstack_unlock(void) __releases(¤t->sighand->siglock) { spin_unlock_irq(¤t->sighand->siglock); } #else static inline void sigaltstack_lock(void) { } static inline void sigaltstack_unlock(void) { } #endif static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); oss->ss_sp = (void __user *) t->sas_ss_sp; oss->ss_size = t->sas_ss_size; oss->ss_flags = sas_ss_flags(sp) | (current->sas_ss_flags & SS_FLAG_BITS); } if (ss) { void __user *ss_sp = ss->ss_sp; size_t ss_size = ss->ss_size; unsigned ss_flags = ss->ss_flags; int ss_mode; if (unlikely(on_sig_stack(sp))) return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && ss_mode != 0)) return -EINVAL; /* * Return before taking any locks if no actual * sigaltstack changes were requested. */ if (t->sas_ss_sp == (unsigned long)ss_sp && t->sas_ss_size == ss_size && t->sas_ss_flags == ss_flags) return 0; sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) ret = -ENOMEM; if (!sigaltstack_size_valid(ss_size)) ret = -ENOMEM; } if (!ret) { t->sas_ss_sp = (unsigned long) ss_sp; t->sas_ss_size = ss_size; t->sas_ss_flags = ss_flags; } sigaltstack_unlock(); } return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { stack_t new, old; int err; if (uss && copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, current_user_stack_pointer(), MINSIGSTKSZ); if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) err = -EFAULT; return err; } int restore_altstack(const stack_t __user *uss) { stack_t new; if (copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(), MINSIGSTKSZ); /* squash all but EFAULT for now */ return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #ifdef CONFIG_COMPAT static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; if (uss_ptr) { compat_stack_t uss32; if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, compat_user_stack_pointer(), COMPAT_MINSIGSTKSZ); if (ret >= 0 && uoss_ptr) { compat_stack_t old; memset(&old, 0, sizeof(old)); old.ss_sp = ptr_to_compat(uoss.ss_sp); old.ss_flags = uoss.ss_flags; old.ss_size = uoss.ss_size; if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; } COMPAT_SYSCALL_DEFINE2(sigaltstack, const compat_stack_t __user *, uss_ptr, compat_stack_t __user *, uoss_ptr) { return do_compat_sigaltstack(uss_ptr, uoss_ptr); } int compat_restore_altstack(const compat_stack_t __user *uss) { int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { int err; struct task_struct *t = current; err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #endif #ifdef __ARCH_WANT_SYS_SIGPENDING /** * sys_sigpending - examine pending signals * @uset: where mask of pending signal is returned */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sizeof(old_sigset_t))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; do_sigpending(&set); return put_user(set.sig[0], set32); } #endif #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK /** * sys_sigprocmask - examine and change blocked signals * @how: whether to add, remove, or set signals * @nset: signals to add or remove (if non-null) * @oset: previous value of signal mask if non-null * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: new_blocked.sig[0] = new_set; break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) return -EFAULT; } return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent * @act: new sigaction * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); if (ret) return ret; if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct compat_sigaction __user *, act, struct compat_sigaction __user *, oact, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (act) { compat_uptr_t handler; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); #endif } return ret; } #endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef CONFIG_OLD_SIGACTION SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act, struct old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; if (act) { old_sigset_t mask; if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, const struct compat_old_sigaction __user *, act, struct compat_old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; compat_old_sigset_t mask; compat_uptr_t handler, restorer; if (act) { if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. */ SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } SYSCALL_DEFINE1(ssetmask, int, newmask) { int old = current->blocked.sig[0]; sigset_t newset; siginitset(&newset, newmask); set_current_blocked(&newset); return old; } #endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* * For backwards compatibility. Functionality superseded by sigaction. */ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; new_sa.sa.sa_handler = handler; new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; sigemptyset(&new_sa.sa.sa_mask); ret = do_sigaction(sig, &new_sa, &old_sa); return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } #endif /* __ARCH_WANT_SYS_SIGNAL */ #ifdef __ARCH_WANT_SYS_PAUSE SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; } #endif static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } set_restore_sigmask(); return -ERESTARTNOHAND; } /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received * @unewset: new signal mask value * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; return sigsuspend(&newset); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&newset, unewset)) return -EFAULT; return sigsuspend(&newset); } #endif #ifdef CONFIG_OLD_SIGSUSPEND SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif #ifdef CONFIG_OLD_SIGSUSPEND3 SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif __weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } static inline void siginfo_buildtime_checks(void) { BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); /* Verify the offsets in the two siginfos match */ #define CHECK_OFFSET(field) \ BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field)) /* kill */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); /* timer */ CHECK_OFFSET(si_tid); CHECK_OFFSET(si_overrun); CHECK_OFFSET(si_value); /* rt */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_value); /* sigchld */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_status); CHECK_OFFSET(si_utime); CHECK_OFFSET(si_stime); /* sigfault */ CHECK_OFFSET(si_addr); CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); CHECK_OFFSET(si_fd); /* sigsys */ CHECK_OFFSET(si_call_addr); CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET /* usb asyncio */ BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != offsetof(struct siginfo, si_addr)); if (sizeof(int) == sizeof(void __user *)) { BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != sizeof(void __user *)); } else { BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + sizeof_field(struct siginfo, si_uid)) != sizeof(void __user *)); BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != offsetof(struct siginfo, si_uid)); } #ifdef CONFIG_COMPAT BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != offsetof(struct compat_siginfo, si_addr)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof(compat_uptr_t)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof_field(struct siginfo, si_pid)); #endif } #if defined(CONFIG_SYSCTL) static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #endif }; static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); return 0; } early_initcall(init_signal_sysctls); #endif /* CONFIG_SYSCTL */ void __init signals_init(void) { siginfo_buildtime_checks(); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB #include <linux/kdb.h> /* * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } new_t = kdb_prev_t != t; kdb_prev_t = t; if (!task_is_running(t) && new_t) { spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. " "The signal has _not_ been sent.\n" "Reissue the kill command if you want to risk " "the deadlock.\n"); return; } ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); } #endif /* CONFIG_KGDB_KDB */ |
3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/notifier.h> /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. */ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n, bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { WARN(1, "notifier callback %ps already registered", n->notifier_call); return -EEXIST; } if (n->priority > (*nl)->priority) break; if (n->priority == (*nl)->priority && unique_priority) return -EBUSY; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); trace_notifier_register((void *)n->notifier_call); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); trace_notifier_unregister((void *)n->notifier_call); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * Return: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif trace_notifier_run((void *)nb->notifier_call); ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; } return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /** * notifier_call_chain_robust - Inform the registered notifiers about an event * and rollback on error. * @nl: Pointer to head of the blocking notifier chain * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up * @v: Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * * Return: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, void *v) { int ret, nr = 0; ret = notifier_call_chain(nl, val_up, v, -1, &nr); if (ret & NOTIFY_STOP_MASK) notifier_call_chain(nl, val_down, v, nr-1, NULL); return ret; } /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Returns 0 on success, %-EEXIST on error. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, true); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. */ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; rcu_read_lock(); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /** * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty * @nh: Pointer to head of the atomic notifier chain * * Checks whether notifier chain is empty. * * Returns true is notifier chain is empty, false otherwise. */ bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) { return !rcu_access_pointer(nh->head); } /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n, bool unique_priority) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, false); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to an blocking notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, true); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); /** * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Returns 0 on success, %-EEXIST on error. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { return notifier_call_chain_robust(&nh->head, val_up, val_down, v); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); /** * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier); |
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Multicast support for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c */ /* Changes: * * yoshfuji : fix format of router-alert option * YOSHIFUJI Hideaki @USAGI: * Fixed source address for MLD message based on * <draft-ietf-magma-mld-source-05.txt>. * YOSHIFUJI Hideaki @USAGI: * - Ignore Queries for invalid addresses. * - MLD for link-local addresses. * David L Stevens <dlstevens@us.ibm.com>: * - MLDv2 support */ #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/mld.h> #include <linux/workqueue.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/ip6_checksum.h> /* Ensure that we have struct in6_addr aligned on 32bit word. */ static int __mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct workqueue_struct *mld_wq; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); static void mld_mca_work(struct work_struct *work); static void mld_ifc_event(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ #define MLD_QI_DEFAULT (125 * HZ) /* RFC3810, 9.3. Query Response Interval */ #define MLD_QRI_DEFAULT (10 * HZ) /* RFC3810, 8.1 Query Version Distinctions */ #define MLD_V1_QUERY_LEN 24 #define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group */ #define mc_dereference(e, idev) \ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) #define sock_dereference(e, sk) \ rcu_dereference_protected(e, lockdep_sock_is_held(sk)) #define for_each_pmc_socklock(np, sk, pmc) \ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \ pmc; \ pmc = sock_dereference(pmc->next, sk)) #define for_each_pmc_rcu(np, pmc) \ for (pmc = rcu_dereference((np)->ipv6_mc_list); \ pmc; \ pmc = rcu_dereference(pmc->next)) #define for_each_psf_mclock(mc, psf) \ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_psf_rcu(mc, psf) \ for (psf = rcu_dereference((mc)->mca_sources); \ psf; \ psf = rcu_dereference(psf->sf_next)) #define for_each_psf_tomb(mc, psf) \ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_mc_mclock(idev, mc) \ for (mc = mc_dereference((idev)->mc_list, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) #define for_each_mc_rcu(idev, mc) \ for (mc = rcu_dereference((idev)->mc_list); \ mc; \ mc = rcu_dereference(mc->next)) #define for_each_mc_tomb(idev, mc) \ for (mc = mc_dereference((idev)->mc_tomb, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; if (mld_in_v1_mode(idev)) iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval); else iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval); return iv > 0 ? iv : 1; } static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; ASSERT_RTNL(); if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for_each_pmc_socklock(np, sk, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) return -EADDRINUSE; } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } } else dev = __dev_get_by_index(net, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = mode; RCU_INIT_POINTER(mc_lst->sflist, NULL); /* * now add/increase the group membership on the device */ err = __ipv6_dev_mc_inc(dev, addr, mode); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); return 0; } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_sock_mc_join); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { return __ipv6_sock_mc_join(sk, ifindex, addr, mode); } /* * socket leave on multicast group */ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct ipv6_mc_socklist __rcu **lnk; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for (lnk = &np->ipv6_mc_list; (mc_lst = sock_dereference(*lnk, sk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } return -EADDRNOTAVAIL; } EXPORT_SYMBOL(ipv6_sock_mc_drop); static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, const struct in6_addr *group, int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; if (ifindex == 0) { struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } } else { dev = __dev_get_by_index(net, ifindex); } if (!dev) return NULL; idev = __in6_dev_get(dev); if (!idev) return NULL; if (idev->dead) return NULL; return idev; } void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); ASSERT_RTNL(); while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); } } void ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); if (!rcu_access_pointer(np->ipv6_mc_list)) return; rtnl_lock(); lock_sock(sk); __ipv6_sock_mc_close(sk); release_sock(sk); rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr) { struct in6_addr *source, *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; struct net *net = sock_net(sk); int i, j, rv; int leavegroup = 0; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; err = -EADDRNOTAVAIL; mutex_lock(&idev->mc_lock); for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (rcu_access_pointer(pmc->sflist)) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip6_mc_add_src(idev, group, omode, 0, NULL, 0); ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = sock_dereference(pmc->sflist, sk); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= sysctl_mld_max_msf) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip6_sf_socklist *newpsl; int count = IP6_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP6_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) /* There is an error in the address. */ goto done; } for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; psl->sl_count++; err = 0; /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: mutex_unlock(&idev->mc_lock); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list) { const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, err; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; if (gsf->gf_fmode != MCAST_INCLUDE && gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); if (!idev) return -ENODEV; err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (gsf->gf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, gsf->gf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } mutex_lock(&idev->mc_lock); err = ip6_mc_add_src(idev, group, gsf->gf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; mutex_lock(&idev->mc_lock); ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); } mutex_lock(&idev->mc_lock); psl = sock_dereference(pmc->sflist, sk); if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); mutex_unlock(&idev->mc_lock); kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; unsigned int count; int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; /* changes to the ipv6_mc_list require the socket lock and * rtnl lock. We have the socket lock, so reading the list is safe. */ for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; psin6 = (struct sockaddr_in6 *)&ss; memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr) { const struct ipv6_pinfo *np = inet6_sk(sk); const struct ipv6_mc_socklist *mc; const struct ip6_sf_socklist *psl; bool rv = true; rcu_read_lock(); for_each_pmc_rcu(np, mc) { if (ipv6_addr_equal(&mc->addr, mc_addr)) break; } if (!mc) { rcu_read_unlock(); return inet6_test_bit(MC6_ALL, sk); } psl = rcu_dereference(mc->sflist); if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; } else { int i; for (i = 0; i < psl->sl_count; i++) { if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) break; } if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) rv = false; if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = false; } rcu_read_unlock(); return rv; } /* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } /* else v2 */ /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we * should not send filter-mode change record as the mode * should be from IN() to IN(A). */ if (mc->mca_sfmode == MCAST_EXCLUDE) mc->mca_crcount = mc->idev->mc_qrv; mld_ifc_event(mc->idev); } /* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } if (mc->mca_flags & MAF_NOREPORT) return; if (!mc->idev->dead) igmp6_leave_group(mc); if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); } /* * deleted ifmcaddr6 manipulation * called with mc_lock */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc; /* this is an "ifmcaddr6" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; pmc->idev = im->idev; in6_dev_hold(idev); pmc->mca_addr = im->mca_addr; pmc->mca_crcount = idev->mc_qrv; pmc->mca_sfmode = im->mca_sfmode; if (pmc->mca_sfmode == MCAST_INCLUDE) { struct ip6_sf_list *psf; rcu_assign_pointer(pmc->mca_tomb, mc_dereference(im->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_sources, mc_dereference(im->mca_sources, idev)); RCU_INIT_POINTER(im->mca_tomb, NULL); RCU_INIT_POINTER(im->mca_sources, NULL); for_each_psf_mclock(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } rcu_assign_pointer(pmc->next, idev->mc_tomb); rcu_assign_pointer(idev->mc_tomb, pmc); } /* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; struct ifmcaddr6 *pmc, *pmc_prev; pmc_prev = NULL; for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc->next); else rcu_assign_pointer(idev->mc_tomb, pmc->next); } if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { tomb = rcu_replace_pointer(im->mca_tomb, mc_dereference(pmc->mca_tomb, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_tomb, tomb); sources = rcu_replace_pointer(im->mca_sources, mc_dereference(pmc->mca_sources, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_sources, sources); for_each_psf_mclock(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); ip6_mc_clear_src(pmc); kfree_rcu(pmc, rcu); } } /* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { nextpmc = mc_dereference(pmc->next, idev); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } /* clear dead sources, too */ for_each_mc_mclock(idev, pmc) { struct ip6_sf_list *psf, *psf_next; psf = mc_dereference(pmc->mca_tomb, idev); RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (; psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); kfree_rcu(psf, rcu); } } } static void mld_clear_query(struct inet6_dev *idev) { struct sk_buff *skb; spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) kfree_skb(skb); spin_unlock_bh(&idev->mc_query_lock); } static void mld_clear_report(struct inet6_dev *idev) { struct sk_buff *skb; spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) kfree_skb(skb); spin_unlock_bh(&idev->mc_report_lock); } static void mca_get(struct ifmcaddr6 *mc) { refcount_inc(&mc->mca_refcnt); } static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { in6_dev_put(mc->idev); kfree_rcu(mc, rcu); } } /* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; refcount_set(&mc->mca_refcnt, 1); mc->mca_sfmode = mode; mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) mc->mca_flags |= MAF_NOREPORT; return mc; } static void inet6_ifmcaddr_notify(struct net_device *dev, const struct ifmcaddr6 *ifmca, int event) { struct inet6_fill_args fillargs = { .portid = 0, .seq = 0, .event = event, .flags = 0, .netnsid = -1, .force_rt_scope_universe = true, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(struct in6_addr)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); } /* * device multicast group inc (add if not found) */ static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; struct inet6_dev *idev; ASSERT_RTNL(); /* we need to take a reference on idev */ idev = in6_dev_get(dev); if (!idev) return -EINVAL; if (idev->dead) { in6_dev_put(idev); return -ENODEV; } mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return 0; } } mc = mca_alloc(idev, addr, mode); if (!mc) { mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENOMEM; } rcu_assign_pointer(mc->next, idev->mc_list); rcu_assign_pointer(idev->mc_list, mc); mca_get(mc); mld_del_delrec(idev, mc); igmp6_group_added(mc); inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; } int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_dev_mc_inc); /* * device multicast group del */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, __rcu **map; ASSERT_RTNL(); mutex_lock(&idev->mc_lock); for (map = &idev->mc_list; (ma = mc_dereference(*map, idev)); map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; igmp6_group_dropped(ma); inet6_ifmcaddr_notify(idev->dev, ma, RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); ma_put(ma); return 0; } mutex_unlock(&idev->mc_lock); return 0; } } mutex_unlock(&idev->mc_lock); return -ENOENT; } int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; int err; ASSERT_RTNL(); idev = __in6_dev_get(dev); if (!idev) err = -ENODEV; else err = __ipv6_dev_mc_dec(idev, addr); return err; } EXPORT_SYMBOL(ipv6_dev_mc_dec); /* * check if the interface/address pair is valid */ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr) { struct inet6_dev *idev; struct ifmcaddr6 *mc; bool rv = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (!idev) goto unlock; for_each_mc_rcu(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } if (!mc) goto unlock; if (src_addr && !ipv6_addr_any(src_addr)) { struct ip6_sf_list *psf; for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } if (psf) rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); else rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; } else { rv = true; /* don't filter unspecified source */ } unlock: rcu_read_unlock(); return rv; } /* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = get_random_u32_below(idev->mc_maxdelay); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } /* called with mc_lock */ static void mld_gq_stop_work(struct inet6_dev *idev) { idev->mc_gq_running = 0; if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } /* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } /* called with mc_lock */ static void mld_ifc_stop_work(struct inet6_dev *idev) { idev->mc_ifc_count = 0; if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } /* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } static void mld_dad_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work(&idev->mc_dad_work)) __in6_dev_put(idev); } static void mld_query_stop_work(struct inet6_dev *idev) { spin_lock_bh(&idev->mc_query_lock); if (cancel_delayed_work(&idev->mc_query_work)) __in6_dev_put(idev); spin_unlock_bh(&idev->mc_query_lock); } static void mld_report_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work_sync(&idev->mc_report_work)) __in6_dev_put(idev); } /* * IGMP handling (alias multicast ICMPv6 messages) * called with mc_lock */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (delay >= resptime) delay = get_random_u32_below(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING; } /* mark EXCLUDE-mode sources * called with mc_lock */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { scount++; break; } } } pmc->mca_flags &= ~MAF_GSQUERY; if (scount == nsrcs) /* all sources excluded */ return false; return true; } /* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; if (pmc->mca_sfmode == MCAST_EXCLUDE) return mld_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; break; } } } if (!scount) { pmc->mca_flags &= ~MAF_GSQUERY; return false; } pmc->mca_flags |= MAF_GSQUERY; return true; } static int mld_force_mld_version(const struct inet6_dev *idev) { const struct net *net = dev_net(idev->dev); int all_force; all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version); /* Normally, both are 0 here. If enforcement to a particular is * being used, individual device enforcement will have a lower * precedence over 'all' device (.../conf/all/force_mld_version). */ return all_force ?: READ_ONCE(idev->cnf.force_mld_version); } static bool mld_in_v2_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 2; } static bool mld_in_v1_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 1; } static bool mld_in_v1_mode(const struct inet6_dev *idev) { if (mld_in_v2_mode_only(idev)) return false; if (mld_in_v1_mode_only(idev)) return true; if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) return true; return false; } static void mld_set_v1_mode(struct inet6_dev *idev) { /* RFC3810, relevant sections: * - 9.1. Robustness Variable * - 9.2. Query Interval * - 9.3. Query Response Interval * - 9.12. Older Version Querier Present Timeout */ unsigned long switchback; switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; idev->mc_v1_seen = jiffies + switchback; } static void mld_update_qrv(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.8. QRV (Querier's Robustness Variable) * - 9.1. Robustness Variable */ /* The value of the Robustness Variable MUST NOT be zero, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", idev->mc_qrv, min_qrv); idev->mc_qrv = min_qrv; } } static void mld_update_qi(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.9. QQIC (Querier's Query Interval Code) * - 9.2. Query Interval * - 9.12. Older Version Querier Present Timeout * (the [Query Interval] in the last Query received) */ unsigned long mc_qqi; if (mlh2->mld2q_qqic < 128) { mc_qqi = mlh2->mld2q_qqic; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); mc_qqi = (mc_man | 0x10) << (mc_exp + 3); } idev->mc_qi = mc_qqi * HZ; } static void mld_update_qri(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.3. Maximum Response Code * - 9.3. Query Response Interval */ idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; /* Ignore v1 queries */ if (mld_in_v2_mode_only(idev)) return -EINVAL; mldv1_md = ntohs(mld->mld_maxdelay); /* When in MLDv1 fallback and a MLDv2 router start-up being * unaware of current MLDv1 operation, the MRC == MRD mapping * only works when the exponential algorithm is not being * used (as MLDv1 is unaware of such things). * * According to the RFC author, the MLDv2 implementations * he's aware of all use a MRC < 32768 on start up queries. * * Thus, should we *ever* encounter something else larger * than that, just assume the maximum possible within our * reach. */ if (!v1_query) mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); /* MLDv1 router present: we need to go into v1 mode *only* * when an MLDv1 query is received as per section 9.12. of * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 * queries MUST be of exactly 24 octets. */ if (v1_query) mld_set_v1_mode(idev); /* cancel MLDv2 report work */ mld_gq_stop_work(idev); /* cancel the interface change work */ mld_ifc_stop_work(idev); /* clear deleted report items */ mld_clear_delrec(idev); return 0; } static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); mld_update_qi(idev, mld); mld_update_qri(idev, mld); idev->mc_maxdelay = *max_delay; return; } /* called with rcu_read_lock() */ void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); out: kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; int mark = 0; int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) goto kfree_skb; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); /* RFC3810 6.2 * Upon reception of an MLD message that contains a Query, the node * checks if the source address of the message is a valid link-local * address, if the Hop Limit is set to 1, and if the Router Alert * option is present in the Hop-By-Hop Options header of the IPv6 * packet. If any of these checks fails, the packet is dropped. */ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; group_type = ipv6_addr_type(group); if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) goto out; if (len < MLD_V1_QUERY_LEN) { goto out; } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { err = mld_process_v1(idev, mld, &max_delay, len == MLD_V1_QUERY_LEN); if (err < 0) goto out; } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) goto out; /* no sources allowed */ mld_gq_start_work(idev); goto out; } /* mark sources to include, if group & source-specific */ if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { goto out; } if (group_type == IPV6_ADDR_ANY) { for_each_mc_mclock(idev, ma) { igmp6_group_queried(ma, max_delay); } } else { for_each_mc_mclock(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ if (!mark) ma->mca_flags &= ~MAF_GSQUERY; } else { /* gsquery <- mark */ if (mark) ma->mca_flags |= MAF_GSQUERY; else ma->mca_flags &= ~MAF_GSQUERY; } if (!(ma->mca_flags & MAF_GSQUERY) || mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); break; } } out: in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_query_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_query_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_query_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_query_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0)) return; in6_dev_put(idev); } /* called with rcu_read_lock() */ void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); out: kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) { struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. */ if (skb->pkt_type == PACKET_LOOPBACK) goto kfree_skb; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) goto kfree_skb; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); /* Drop reports with not link local source */ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; /* * Cancel the work for this group */ for_each_mc_mclock(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); ma->mca_flags &= ~(MAF_LAST_REPORTER | MAF_TIMER_RUNNING); break; } } in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_report_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_report_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_report_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_report_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0)) return; in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return false; if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { if (pmc->mca_sfmode == MCAST_INCLUDE) return true; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == MLD2_MODE_IS_INCLUDE; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return false; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return false; return psf->sf_count[MCAST_INCLUDE] != 0; case MLD2_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return false; if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return false; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case MLD2_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return false; return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; case MLD2_BLOCK_OLD_SOURCES: if (pmc->mca_sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return false; } static int mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) { struct ip6_sf_list *psf; int scount = 0; for_each_psf_mclock(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reset_network_header(skb); skb_put(skb, sizeof(struct ipv6hdr)); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, 0, 0); hdr->payload_len = htons(len); hdr->nexthdr = proto; hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit); hdr->saddr = *saddr; hdr->daddr = *daddr; } static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; struct net *net; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); rcu_read_lock(); net = dev_net_rcu(dev); sk = net->ipv6.igmp_sk; skb_set_owner_w(skb, sk); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); rcu_read_unlock(); skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); skb_put(skb, sizeof(*pmr)); pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_type = ICMPV6_MLD2_REPORT; pmr->mld2r_resv1 = 0; pmr->mld2r_cksum = 0; pmr->mld2r_resv2 = 0; pmr->mld2r_ngrec = 0; return skb; } static void mld_sendpack(struct sk_buff *skb) { struct ipv6hdr *pip6 = ipv6_hdr(skb); struct mld2_report *pmr = (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); int err; struct flowi6 fl6; struct dst_entry *dst; rcu_read_lock(); idev = __in6_dev_get(skb->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - sizeof(*pip6); mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), mldlen, 0)); icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); err = 0; if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; } skb_dst_set(skb, dst); if (err) goto err_out; err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) { return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, struct mld2_grec **ppgr, unsigned int mtu) { struct mld2_report *pmr; struct mld2_grec *pgr; if (!skb) { skb = mld_newpack(pmc->idev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) /* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) { struct ip6_sf_list *psf, *psf_prev, *psf_next; int scount, stotal, first, isquery, truncate; struct ip6_sf_list __rcu **psf_list; struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; struct mld2_grec *pgr = NULL; struct mld2_report *pmr; unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV6_MIN_MTU) return skb; isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; if (!rcu_access_pointer(*psf_list)) goto empty_source; pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pmr && pmr->mld2r_ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); } } first = 1; psf_prev = NULL; for (psf = mc_dereference(*psf_list, idev); psf; psf = psf_next) { struct in6_addr *psrc; psf_next = mc_dereference(psf->sf_next, idev); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } /* Based on RFC3810 6.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->mca_crcount)) && (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(*psrc) + first*sizeof(struct mld2_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*psf_list, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ return skb; } /* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; if (!pmc) { for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } } else { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } if (skb) mld_sendpack(skb); } /* * remove zero-count source records from a source filter list * called with mc_lock */ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = mc_dereference(*ppsf, idev); psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); if (psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*ppsf, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); } else { psf_prev = psf; } } } /* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; /* deleted MCA's */ pmc_prev = NULL; for (pmc = mc_dereference(idev->mc_tomb, idev); pmc; pmc = pmc_next) { pmc_next = mc_dereference(pmc->next, idev); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0, 0); skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb, idev); mld_clear_zeros(&pmc->mca_sources, idev); } } if (pmc->mca_crcount == 0 && !rcu_access_pointer(pmc->mca_tomb) && !rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc_next); else rcu_assign_pointer(idev->mc_tomb, pmc_next); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } else pmc_prev = pmc; } /* change recs */ for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; } else { type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } } if (!skb) return; (void) mld_sendpack(skb); } static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { const struct in6_addr *snd_addr, *saddr; int err, len, payload_len, full_len; struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct dst_entry *dst; struct flowi6 fl6; struct net *net; struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; else snd_addr = addr; len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); rcu_read_lock(); net = dev_net_rcu(dev); idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } sk = net->ipv6.igmp_sk; skb_set_owner_w(skb, sk); skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); skb_put_data(skb, ra, sizeof(ra)); hdr = skb_put_zero(skb, sizeof(struct mld_msg)); hdr->mld_type = type; hdr->mld_mca = *addr; hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err_out; } skb_dst_set(skb, dst); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } /* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { struct sk_buff *skb; struct ifmcaddr6 *pmc; int type; if (mld_in_v1_mode(idev)) return; skb = NULL; for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); } if (skb) mld_sendpack(skb); } void ipv6_mc_dad_complete(struct inet6_dev *idev) { mutex_lock(&idev->mc_lock); idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); } static void mld_dad_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_dad_work); mutex_lock(&idev->mc_lock); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } /* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; /* no more filters for this source */ if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(pmc->mca_sources, mc_dereference(psf->sf_next, idev)); if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; rcu_assign_pointer(psf->sf_next, mc_dereference(pmc->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; } else { kfree_rcu(psf, rcu); } } return rv; } /* called with mc_lock */ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int changerec = 0; int i, err; if (!idev) return -ENODEV; for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); if (!delta) { if (!pmc->mca_sfcount[sfmode]) return -EINVAL; pmc->mca_sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->mca_sfmode == MCAST_EXCLUDE && pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && pmc->mca_sfcount[MCAST_INCLUDE]) { struct ip6_sf_list *psf; /* filter mode change */ pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) { mld_ifc_event(pmc->idev); } return err; } /* * Add multicast single-source filter to the interface list * called with mc_lock */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf) { psf = kzalloc(sizeof(*psf), GFP_KERNEL); if (!psf) return -ENOBUFS; psf->sf_addr = *psfsrc; if (psf_prev) { rcu_assign_pointer(psf_prev->sf_next, psf); } else { rcu_assign_pointer(pmc->mca_sources, psf); } } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } /* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else { psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } } } /* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *dpsf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; int qrv = pmc->idev->mc_qrv; int new_in, rv; rv = 0; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; for_each_psf_tomb(pmc, dpsf) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; prev = dpsf; } if (dpsf) { if (prev) rcu_assign_pointer(prev->sf_next, mc_dereference(dpsf->sf_next, pmc->idev)); else rcu_assign_pointer(pmc->mca_tomb, mc_dereference(dpsf->sf_next, pmc->idev)); kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for_each_psf_tomb(pmc, dpsf) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; if (!dpsf) { dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL); if (!dpsf) continue; *dpsf = *psf; rcu_assign_pointer(dpsf->sf_next, mc_dereference(pmc->mca_tomb, pmc->idev)); rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; rv++; } } return rv; } /* * Add multicast source filter list to the interface list * called with mc_lock */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int isexclude; int i, err; if (!idev) return -ENODEV; for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; /* filter mode change */ if (pmc->mca_sfcount[MCAST_EXCLUDE]) pmc->mca_sfmode = MCAST_EXCLUDE; else if (pmc->mca_sfcount[MCAST_INCLUDE]) pmc->mca_sfmode = MCAST_INCLUDE; /* else no filters; keep old mode for reports */ pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) { mld_ifc_event(idev); } return err; } /* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (psf = mc_dereference(pmc->mca_sources, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; if (ma->mca_flags & MAF_NOREPORT) return; igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); delay = get_random_u32_below(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; } static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev) { struct ip6_sf_socklist *psl; int err; psl = sock_dereference(iml->sflist, sk); if (idev) mutex_lock(&idev->mc_lock); if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } if (idev) mutex_unlock(&idev->mc_lock); return err; } /* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); } } else { mld_add_delrec(ma->idev, ma); mld_ifc_event(ma->idev); } } static void mld_gq_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_gq_work); mutex_lock(&idev->mc_lock); mld_send_report(idev, NULL); idev->mc_gq_running = 0; mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } static void mld_ifc_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_ifc_work); mutex_lock(&idev->mc_lock); mld_send_cr(idev); if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) mld_ifc_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } /* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_work(idev, 1); } static void mld_mca_work(struct work_struct *work) { struct ifmcaddr6 *ma = container_of(to_delayed_work(work), struct ifmcaddr6, mca_work); mutex_lock(&ma->idev->mc_lock); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; mutex_unlock(&ma->idev->mc_lock); ma_put(ma); } /* Device changing type */ void ipv6_mc_unmap(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); } void ipv6_mc_remap(struct inet6_dev *idev) { ipv6_mc_up(idev); } /* Device going down */ void ipv6_mc_down(struct inet6_dev *idev) { struct ifmcaddr6 *i; mutex_lock(&idev->mc_lock); /* Withdraw multicast list */ for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ mld_query_stop_work(idev); mld_report_stop_work(idev); mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); mutex_unlock(&idev->mc_lock); mld_dad_stop_work(idev); } static void ipv6_mc_reset(struct inet6_dev *idev) { idev->mc_qrv = sysctl_mld_qrv; idev->mc_qi = MLD_QI_DEFAULT; idev->mc_qri = MLD_QRI_DEFAULT; idev->mc_v1_seen = 0; idev->mc_maxdelay = unsolicited_report_interval(idev); } /* Device going up */ void ipv6_mc_up(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ ipv6_mc_reset(idev); mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } mutex_unlock(&idev->mc_lock); } /* IPv6 device initialization. */ void ipv6_mc_init_dev(struct inet6_dev *idev) { idev->mc_gq_running = 0; INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); RCU_INIT_POINTER(idev->mc_tomb, NULL); idev->mc_ifc_count = 0; INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work); INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work); skb_queue_head_init(&idev->mc_query_queue); skb_queue_head_init(&idev->mc_report_queue); spin_lock_init(&idev->mc_query_lock); spin_lock_init(&idev->mc_report_lock); mutex_init(&idev->mc_lock); ipv6_mc_reset(idev); } /* * Device is about to be destroyed: clean up. */ void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Deactivate works */ ipv6_mc_down(idev); mutex_lock(&idev->mc_lock); mld_clear_delrec(idev); mutex_unlock(&idev->mc_lock); mld_clear_query(idev); mld_clear_report(idev); /* Delete all-nodes address. */ /* We cannot call ipv6_dev_mc_dec() directly, our caller in * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will * fail. */ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes); if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); mutex_lock(&idev->mc_lock); while ((i = mc_dereference(idev->mc_list, idev))) { rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev)); ip6_mc_clear_src(i); ma_put(i); } mutex_unlock(&idev->mc_lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; ASSERT_RTNL(); mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { for_each_mc_mclock(idev, pmc) igmp6_join_group(pmc); } else { mld_send_report(idev, NULL); } mutex_unlock(&idev->mc_lock); } static int ipv6_mc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); switch (event) { case NETDEV_RESEND_IGMP: if (idev) ipv6_mc_rejoin_groups(idev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp6_netdev_notifier = { .notifier_call = ipv6_mc_netdev_event, }; #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; }; #define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) { struct ifmcaddr6 *im = NULL; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->mc_list); if (im) { state->idev = idev; break; } } return im; } static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); im = rcu_dereference(im->next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; break; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; im = rcu_dereference(state->idev->mc_list); } return im; } static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ifmcaddr6 *im = igmp6_mc_get_first(seq); if (im) while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return igmp6_mc_get_idx(seq, *pos); } static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v); ++*pos; return im; } static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mc_seq_show(struct seq_file *seq, void *v) { struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d %08X %ld\n", state->dev->ifindex, state->dev->name, &im->mca_addr, im->mca_users, im->mca_flags, (im->mca_flags & MAF_TIMER_RUNNING) ? jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } static const struct seq_operations igmp6_mc_seq_ops = { .start = igmp6_mc_seq_start, .next = igmp6_mc_seq_next, .stop = igmp6_mc_seq_stop, .show = igmp6_mc_seq_show, }; struct igmp6_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; struct ifmcaddr6 *im; }; #define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) { struct ip6_sf_list *psf = NULL; struct ifmcaddr6 *im = NULL; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; break; } } } return psf; } static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); psf = rcu_dereference(psf->sf_next); while (!psf) { state->im = rcu_dereference(state->im->next); while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } psf = rcu_dereference(state->im->mca_sources); } out: return psf; } static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); if (psf) while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp6_mcf_get_first(seq); else psf = igmp6_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (likely(state->im)) state->im = NULL; if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) { struct ip6_sf_list *psf = (struct ip6_sf_list *)v; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, READ_ONCE(psf->sf_count[MCAST_INCLUDE]), READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } static const struct seq_operations igmp6_mcf_seq_ops = { .start = igmp6_mcf_seq_start, .next = igmp6_mcf_seq_next, .stop = igmp6_mcf_seq_stop, .show = igmp6_mcf_seq_show, }; static int __net_init igmp6_proc_init(struct net *net) { int err; err = -ENOMEM; if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops, sizeof(struct igmp6_mc_iter_state))) goto out; if (!proc_create_net("mcfilter6", 0444, net->proc_net, &igmp6_mcf_seq_ops, sizeof(struct igmp6_mcf_iter_state))) goto out_proc_net_igmp6; err = 0; out: return err; out_proc_net_igmp6: remove_proc_entry("igmp6", net->proc_net); goto out; } static void __net_exit igmp6_proc_exit(struct net *net) { remove_proc_entry("mcfilter6", net->proc_net); remove_proc_entry("igmp6", net->proc_net); } #else static inline int igmp6_proc_init(struct net *net) { return 0; } static inline void igmp6_proc_exit(struct net *net) { } #endif static int __net_init igmp6_net_init(struct net *net) { int err; err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 control socket (err %d)\n", err); goto out; } inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL; err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", err); goto out_sock_create; } err = igmp6_proc_init(net); if (err) goto out_sock_create_autojoin; return 0; out_sock_create_autojoin: inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); out: return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, }; int __init igmp6_init(void) { int err; err = register_pernet_subsys(&igmp6_net_ops); if (err) return err; mld_wq = create_workqueue("mld"); if (!mld_wq) { unregister_pernet_subsys(&igmp6_net_ops); return -ENOMEM; } return err; } int __init igmp6_late_init(void) { return register_netdevice_notifier(&igmp6_netdev_notifier); } void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); destroy_workqueue(mld_wq); } void igmp6_late_cleanup(void) { unregister_netdevice_notifier(&igmp6_netdev_notifier); } |
435 437 82 82 118 82 74 128 37 129 162 163 163 162 320 217 155 321 214 163 163 163 321 321 86 283 81 313 3 124 35 91 18 321 158 158 159 108 108 179 179 130 324 74 74 306 255 247 12 254 256 131 55 33 32 55 55 110 32 123 131 16 16 131 131 131 179 133 135 87 134 10 133 38 38 179 179 179 179 177 32 179 259 258 157 179 72 87 259 258 316 316 316 314 316 315 316 316 314 62 24 316 319 320 321 321 320 320 3 3 3 3 3 3 306 307 307 307 307 306 306 307 307 307 307 307 306 306 305 306 307 307 306 307 306 307 306 307 307 307 307 306 307 1 306 252 15 15 15 15 11 4 4 162 15 26 220 70 26 70 26 39 1 39 14 39 1 39 1 39 39 39 39 1 39 11 39 39 12 39 1 1 39 39 39 1 39 1 1 39 1 39 3 3 3 6 3 3 307 253 231 307 153 232 46 231 232 230 306 304 10 178 169 16 11 168 90 85 249 249 236 15 8 249 232 363 153 168 156 23 159 12 138 46 46 168 168 92 82 168 168 168 168 135 39 160 160 80 80 160 160 162 6 156 156 1 2 11 11 3 8 8 11 2 15 15 2 15 237 11 17 251 7 158 390 389 445 43 43 13 39 17 17 443 445 444 431 45 441 412 158 444 445 469 469 50 239 4 302 469 353 272 468 470 471 471 402 156 469 43 445 403 155 155 471 323 19 13 13 3 13 320 323 214 149 13 551 551 541 541 551 329 541 318 317 317 318 163 163 21 156 156 39 53 53 53 54 1093 1093 53 7 278 250 41 279 250 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus */ /* * Ok, demand-loading was easy, shared pages a little bit tricker. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. */ /* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. */ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/kmsan.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "pgalloc-track.h" #include "internal.h" #include "swap.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); /* * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). */ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { if (!userfaultfd_wp(vmf->vma)) return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; return pte_marker_uffd_wp(vmf->orig_pte); } /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { /* * Transitioning a PTE from 'old' to 'young' can be expensive on * some architectures, even if it's performed in hardware. By * default, "false" means prefaulted entries will be 'young'. */ return false; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); } /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /* * This function frees user-level page tables of a process. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { struct unlink_vma_file_batch vb; do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; /* * Note: USER_PGTABLES_CEILING may be passed as ceiling and may * be 0. This will underflow and is okay. */ next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); if (is_vm_hugetlb_page(vma)) { unlink_file_vma(vma); hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } else { unlink_file_vma_batch_init(&vb); unlink_file_vma_batch_add(&vb, vma); /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_add(&vb, vma); } unlink_file_vma_batch_final(&vb); free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } vma = next; } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) { spinlock_t *ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd) { pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } /* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The only exception are zeropages, which are * *never* refcounted. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) /* * NOTE: New users of ZONE_DEVICE will not set pte_devmap() * and will have refcounts incremented on their struct pages * when they are inserted into PTEs, thus they are safe to * return here. Legacy ZONE_DEVICE pages that set pte_devmap() * do not have refcounts. Example of legacy ZONE_DEVICE is * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. */ return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; if (is_zero_pfn(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: VM_WARN_ON_ONCE(is_zero_pfn(pfn)); return pfn_to_page(pfn); } struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page = vm_normal_page(vma, addr, pte); if (page) return page_folio(page); return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); /* Currently it's only used for huge pfnmaps */ if (unlikely(pmd_special(pmd))) return NULL; if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (pmd_devmap(pmd)) return NULL; if (is_huge_zero_pmd(pmd)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page = vm_normal_page_pmd(vma, addr, pmd); if (page) return page_folio(page); return NULL; } #endif /** * restore_exclusive_pte - Restore a device-exclusive entry * @vma: VMA covering @address * @folio: the mapped folio * @page: the mapped folio page * @address: the virtual address * @ptep: pte pointer into the locked page table mapping the folio page * @orig_pte: pte value at @ptep * * Restore a device-exclusive non-swap entry to an ordinary present pte. * * The folio and the page table must be locked, and MMU notifiers must have * been called to invalidate any (exclusive) device mappings. * * Locking the folio makes sure that anybody who just converted the pte to * a device-exclusive entry can map it into the device to make forward * progress without others converting it back until the folio was unlocked. * * If the folio lock ever becomes an issue, we can stop relying on the folio * lock; it might make some scenarios with heavy thrashing less likely to * make forward progress, but these scenarios might not be valid use cases. * * Note that the folio lock does not protect against all cases of concurrent * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers * must use MMU notifiers to sync against any concurrent changes. */ static void restore_exclusive_pte(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long address, pte_t *ptep, pte_t orig_pte) { pte_t pte; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); if ((vma->vm_flags & VM_WRITE) && can_change_pte_writable(vma, address, pte)) { if (folio_test_dirty(folio)) pte = pte_mkdirty(pte); pte = pte_mkwrite(pte, vma); } set_pte_at(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. */ update_mmu_cache(vma, address, ptep); } /* * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); folio_unlock(folio); return 0; } return -EBUSY; } /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ if (pte_swp_exclusive(orig_pte)) { pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child * to be set to read. A previously exclusive entry is * now shared. */ entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (is_writable_device_private_entry(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_exclusive_entry(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device * exclusive entries currently only support private writable * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) set_pte_at(dst_mm, addr, dst_pte, make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } /* * Copy a present and normal page. * * NOTE! The usual case is that this isn't required; * instead, the caller can just increase the page refcount * and re-use the pte the traditional way. * * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation * code know so that it can do so outside the page table * lock. */ static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc, struct page *page) { struct folio *new_folio; pte_t pte; new_folio = *prealloc; if (!new_folio) return -EAGAIN; /* * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) return -EHWPOISON; *prealloc = NULL; __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } /* If it's a shared mapping, mark it clean in the child. */ if (src_vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. * * Returns -EAGAIN if one preallocated page is required to copy the next PTE. * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; bool any_writable; fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); /* * If we likely have to copy, just don't bother with batching. Make * sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { if (src_vma->vm_flags & VM_SHARED) flags |= FPB_IGNORE_DIRTY; if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } if (any_writable) pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; } folio_get(folio); if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } copy_pte: __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { folio_put(new_folio); return NULL; } folio_throttle_swaprate(new_folio, GFP_KERNEL); return new_folio; } static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; int nr; again: progress = 0; init_rss_vec(rss); /* * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the * error handling here, assume that exclusive mmap_lock on dst and src * protects anon from unexpected THP transitions; with shmem and file * protected by mmap_lock-less collapse skipping areas with anon_vma * (whereas vma_needs_copy() skips areas without anon_vma). A rework * can remove such assumptions later, but this is good enough for now. */ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } /* * We already hold the exclusive mmap_lock, the copy_pte_range() and * retract_page_tables() are using vma->anon_vma to be exclusive, so * the PTE page is stable, and there is no need to get pmdval and do * pmd_same() check. */ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ goto out; } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { nr = 1; /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } ptent = ptep_get(src_pte); if (pte_none(ptent)) { progress++; continue; } if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; } else if (!ret) { progress += 8; continue; } ptent = ptep_get(src_pte); VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying * the now present pte. */ WARN_ON_ONCE(ret != -ENOENT); } /* copy_present_ptes() will clear `*prealloc' if consumed */ max_nr = (end - addr) / PAGE_SIZE; ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. * If copy failed due to hwpoison in source page, break out. */ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* * pre-alloc page cannot be reused by next time so as * to strictly follow mempolicy (e.g., alloc_page_vma() * will allocate page according to address). This * could only happen if one pinned pte changed. */ folio_put(prealloc); prealloc = NULL; } nr = ret; progress += 8 * nr; } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret < 0) { VM_WARN_ON_ONCE(1); } /* We've captured and resolved the error. Reset, try again. */ ret = 0; if (addr != end) goto again; out: if (unlikely(prealloc)) folio_put(prealloc); return ret; } static inline int copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } /* * Return true if the vma needs to copy the pgtable during this fork(). Return * false when we can speed up fork() by allowing lazy page faults later until * when the child accesses the memory range. */ static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { /* * Always copy pgtables when dst_vma has uffd-wp enabled even if it's * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable * contains uffd-wp protection information, that's something we can't * retrieve from page cache, and skip copying will lose those info. */ if (userfaultfd_wp(dst_vma)) return true; if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return true; if (src_vma->anon_vma) return true; /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly * mappings. The tradeoff is that copy_page_range is more efficient * than faulting. */ return false; } int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; unsigned long next, pfn = 0; bool is_cow; int ret; if (!vma_needs_copy(dst_vma, src_vma)) return 0; if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { ret = track_pfn_copy(dst_vma, src_vma, &pfn); if (ret) return ret; } /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as * the read side doesn't spin, but goes to the mmap_lock. * * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) { raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) untrack_pfn_copy(dst_vma, pfn); return ret; } /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ return details->even_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ static inline bool should_zap_folio(struct zap_details *details, struct folio *folio) { /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; return details->zap_flags & ZAP_FLAG_DROP_MARKER; } /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. * * Returns true if uffd-wp ptes was installed, false otherwise. */ static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { bool was_installed = false; #ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; if (zap_drop_markers(details)) return false; for (;;) { /* the PFN in the PTE is irrelevant. */ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } #endif return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = true; *force_flush = true; } } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); rss[MM_ANONPAGES] -= nr; } /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } /* * Zap or skip at least one present PTE, trying to batch-process subsequent * PTEs that map consecutive pages of the same folio. * * Returns the number of processed (skipped or zapped) PTEs (at least 1). */ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { /* We don't need up-to-date accessed/dirty bits. */ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) { *any_skipped = true; return 1; } /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, details, rss, force_flush, force_break, any_skipped); return 1; } static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { swp_entry_t entry; int nr = 1; *any_skipped = true; entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { struct page *page = pfn_swap_entry_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return 1; /* * Both device private/exclusive mappings should only * work with anonymous page so far, so we don't need to * consider uffd-wp bit when zap. For more information, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; } else if (is_guard_swp_entry(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers * in general. */ if (!zap_drop_markers(details)) return 1; } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) return 1; } else { /* We should have covered all the swap entry types */ pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); return nr; } static inline int do_zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, unsigned long addr, unsigned long end, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { pte_t ptent = ptep_get(pte); int max_nr = (end - addr) / PAGE_SIZE; int nr = 0; /* Skip all consecutive none ptes */ if (pte_none(ptent)) { for (nr = 1; nr < max_nr; nr++) { ptent = ptep_get(pte + nr); if (!pte_none(ptent)) break; } max_nr -= nr; if (!max_nr) return nr; pte += nr; addr += nr * PAGE_SIZE; } if (pte_present(ptent)) nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, force_flush, force_break, any_skipped); else nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, any_skipped); return nr; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; pmd_t pmdval; unsigned long start = addr; bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); bool direct_reclaim = true; int nr; retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return addr; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { bool any_skipped = false; if (need_resched()) { direct_reclaim = false; break; } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); if (any_skipped) can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); /* * Fast path: try to hold the pmd lock and unmap the PTE page. * * If the pte lock was released midway (retry case), or if the attempt * to hold the pmd lock failed, then we need to recheck all pte entries * to ensure they are still none, thereby preventing the pte entries * from being repopulated by another thread. */ if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_rmaps(tlb, vma); } pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Come back again if we didn't do everything. */ if (force_flush) tlb_flush_mmu(tlb); if (addr != end) { cond_resched(); force_flush = false; force_break = false; goto retry; } if (can_reclaim_pt) { if (direct_reclaim) free_pte(mm, start, tlb, pmdval); else try_to_free_pte(mm, pmd, start, tlb); } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* * Take and drop THP pmd lock so that we cannot return * prematurely, while zap_huge_pmd() has cleared *pmd, * but not yet decremented compound_mapcount(). */ spin_unlock(ptl); } if (pmd_none(*pmd)) { addr = next; continue; } addr = zap_pte_range(tlb, vma, pmd, addr, next, details); if (addr != next) pmd--; } while (pmd++, cond_resched(), addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_file) uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } } else unmap_page_range(tlb, vma, start, end, details); } } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @mas: the maple state * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long tree_end, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, /* Careful - we need to zap private pages too! */ .even_cows = true, }; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); hugetlb_zap_end(vma, details); } /** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. * */ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); } EXPORT_SYMBOL_GPL(zap_vma_ptes); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); if (!pmd) return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma) { VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); /* * Whoever wants to forbid the zeropage after some zeropages * might already have been mapped has to scan the page tables and * bail out on any zeropages. Zeropages in COW mappings can * be unshared using FAULT_FLAG_UNSHARE faults. */ if (mm_forbids_zeropage(vma->vm_mm)) return false; /* zeropages in COW mappings are common and unproblematic. */ if (is_cow_mapping(vma->vm_flags)) return true; /* Mappings that do not allow for writable PTEs are unproblematic. */ if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) return true; /* * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could * find the shared zeropage and longterm-pin it, which would * be problematic as soon as the zeropage gets replaced by a different * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would * now differ to what GUP looked up. FSDAX is incompatible to * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see * check_vma_flags). */ return vma->vm_ops && vma->vm_ops->pfn_mkwrite && (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); } static int validate_page_before_insert(struct vm_area_struct *vma, struct page *page) { struct folio *folio = page_folio(page); if (!folio_ref_count(folio)) return -EINVAL; if (unlikely(is_zero_folio(folio))) { if (!vm_mixed_zeropage_allowed(vma)) return -EINVAL; return 0; } if (folio_test_anon(folio) || folio_test_slab(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); pte_t pteval = ptep_get(pte); if (!pte_none(pteval)) { if (!mkwrite) return -EBUSY; /* see insert_pfn(). */ if (pte_pfn(pteval) != page_to_pfn(page)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); return -EFAULT; } pteval = maybe_mkwrite(pteval, vma); pteval = pte_mkyoung(pteval); if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) update_mmu_cache(vma, addr, pte); return 0; } /* Ok, finally just insert the thing.. */ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); pteval = mk_pte(page, prot); if (mkwrite) { pteval = pte_mkyoung(pteval); pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); } inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } set_pte_at(vma->vm_mm, addr, pte, pteval); return 0; } static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; spinlock_t *ptl; retval = validate_page_before_insert(vma, page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; } static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; err = validate_page_before_insert(vma, page); if (err) return err; return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; pte_t *start_pte, *pte; spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; unsigned long pages_to_write_in_pmd; int ret; more: ret = -EFAULT; pmd = walk_to_pmd(mm, addr); if (!pmd) goto out; pages_to_write_in_pmd = min_t(unsigned long, remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); /* Allocate the PTE if necessary; takes PMD lock once only. */ ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); if (!start_pte) { ret = -EFAULT; goto out; } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; } addr += PAGE_SIZE; ++curr_page_idx; } pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } if (remaining_pages_total) goto more; ret = 0; out: *num = remaining_pages_total; return ret; } /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. * @vma: user vma to map to * @addr: target start user address of these pages * @pages: source kernel pages * @num: in: number of pages to map. out: number of pages that were *not* * mapped. (0 means all pages were successfully mapped). * * Preferred over vm_insert_page() when inserting multiple pages. * * In case of error, we may have mapped a subset of the provided * pages. It is the caller's responsibility to account for this case. * * The same restrictions apply as in vm_insert_page(). */ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; if (addr < vma->vm_start || end_addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_pages); /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to * @addr: target user address of this page * @page: source kernel page * * This allows drivers to insert individual pages they've allocated * into a user vma. The zeropage is supported in some VMAs, * see vm_mixed_zeropage_allowed(). * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow * that. Your vma protection will have to be set up correctly, which * means that if you want a shared writable mapping, you'd better * ask for a shared writable mapping! * * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler * under mm->mmap_lock write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. * * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot, false); } EXPORT_SYMBOL(vm_insert_page); /* * __vm_map_pages - maps range of kernel pages into user vma * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * @offset: user's requested vm_pgoff * * This allows drivers to map range of kernel pages into a user vma. * The zeropage is supported in some VMAs, see * vm_mixed_zeropage_allowed(). * * Return: 0 on success and error code otherwise. */ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num, unsigned long offset) { unsigned long count = vma_pages(vma); unsigned long uaddr = vma->vm_start; int ret, i; /* Fail if the user requested offset is beyond the end of the object */ if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ if (count > num - offset) return -ENXIO; for (i = 0; i < count; i++) { ret = vm_insert_page(vma, uaddr, pages[offset + i]); if (ret < 0) return ret; uaddr += PAGE_SIZE; } return 0; } /** * vm_map_pages - maps range of kernel pages starts with non zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Maps an object consisting of @num pages, catering for the user's * requested vm_pgoff * * If we fail to insert any page into the vma, the function will return * immediately leaving any previously inserted pages present. Callers * from the mmap handler may immediately return the error as their caller * will destroy the vma, removing any successfully inserted pages. Other * callers should make their own arrangements for calling unmap_region(). * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, vma->vm_pgoff); } EXPORT_SYMBOL(vm_map_pages); /** * vm_map_pages_zero - map range of kernel pages starts with zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Similar to vm_map_pages(), except that it explicitly sets the offset * to 0. This function is intended for the drivers that did not consider * vm_pgoff. * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, 0); } EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; entry = ptep_get(pte); if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared * mapping and we expect the PFNs to match. If they * don't match, we are likely racing with block * allocation and mapping invalidation so just skip the * update. */ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); } set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ out_unlock: pte_unmap_unlock(pte, ptl); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vmf_insert_pfn(), except that it allows drivers * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * * pgprot typically only differs from @vma->vm_page_prot when drivers set * caching- and encryption bits different than those of @vma->vm_page_prot, * because the caching- or encryption mode may not be known at mmap() time. * * This is ok as long as @vma->vm_page_prot is not used by the core vm * to set caching and encryption bits for those vmas (except for COW pages). * This is ensured by core vm only modifying these page table entries using * functions that don't touch caching- or encryption bits, using pte_modify() * if needed. (See for example mprotect()). * * Also when new page-table entries are created, this is only done using the * fault() callback, and never using the value of vma->vm_page_prot, * except for page-table entries that point to anonymous pages as the result * of COW. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like * consistency in testing and feature parity among all, so we should * try to keep these invariants in place for everybody. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); /** * vmf_insert_pfn - insert single pfn into user vma * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return the result of this function. * * vma cannot be a COW mapping. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) { if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; if (pfn_t_devmap(pfn)) return true; if (pfn_t_special(pfn)) return true; if (is_zero_pfn(pfn_t_to_pfn(pfn))) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; if (!vm_mixed_ok(vma, pfn, mkwrite)) return VM_FAULT_SIGBUS; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* * At this point we are committed to insert_page() * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, bool write) { pgprot_t pgprot = vmf->vma->vm_page_prot; unsigned long addr = vmf->address; int err; if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) return VM_FAULT_SIGBUS; err = insert_page(vmf->vma, addr, page, pgprot, write); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(mapped_pte, ptl); return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pud++, addr = next, addr != end); return 0; } static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { p4d_t *p4d; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = remap_pud_range(mm, p4d, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (p4d++, addr = next, addr != end); return 0; } static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; /* * Physically remapped pages are special. Tell the * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. * VM_DONTEXPAND * Disable vma merging and expanding with mremap(). * VM_DONTDUMP * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ if (is_cow_mapping(vma->vm_flags)) { if (addr != vma->vm_start || end != vma->vm_end) return -EINVAL; vma->vm_pgoff = pfn; } vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pgd++, addr = next, addr != end); return 0; } /* * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); if (!error) return 0; /* * A partial pfn range mapping is dangerous: it does not * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ zap_page_range_single(vma, addr, size, NULL); return error; } /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target page aligned user address to start at * @pfn: page frame number of kernel physical memory address * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. * * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int err; err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to * @start: start of the physical memory to be mapped * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The * driver just needs to give us the physical memory range to be mapped, * we'll figure out the rest from the vma information. * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. * * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { unsigned long vm_len, pfn, pages; /* Check that the physical memory area passed in looks valid */ if (start + len < start) return -EINVAL; /* * You *really* shouldn't map things that aren't page-aligned, * but we've historically allowed it because IO memory might * just have smaller alignment. */ len += start & ~PAGE_MASK; pfn = start >> PAGE_SHIFT; pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; if (pfn + pages < pfn) return -EINVAL; /* We start the mapping 'vm_pgoff' pages into the area */ if (vma->vm_pgoff > pages) return -EINVAL; pfn += vma->vm_pgoff; pages -= vma->vm_pgoff; /* Can we fit all of the mapping? */ vm_len = vma->vm_end - vma->vm_start; if (vm_len >> PAGE_SHIFT > pages) return -EINVAL; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); } EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pte_t *pte, *mapped_pte; int err = 0; spinlock_t *ptl; if (create) { mapped_pte = pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return -EINVAL; } arch_enter_lazy_mmu_mode(); if (fn) { do { if (create || !pte_none(ptep_get(pte))) { err = fn(pte, addr, data); if (err) break; } } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int err = 0; BUG_ON(pud_leaf(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { pmd = pmd_offset(pud, addr); } do { next = pmd_addr_end(addr, end); if (pmd_none(*pmd) && !create) continue; if (WARN_ON_ONCE(pmd_leaf(*pmd))) return -EINVAL; if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { if (!create) continue; pmd_clear_bad(pmd); } err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create, mask); if (err) break; } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { pud = pud_offset(p4d, addr); } do { next = pud_addr_end(addr, end); if (pud_none(*pud) && !create) continue; if (WARN_ON_ONCE(pud_leaf(*pud))) return -EINVAL; if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { if (!create) continue; pud_clear_bad(pud); } err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create, mask); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { p4d = p4d_offset(pgd, addr); } do { next = p4d_addr_end(addr, end); if (p4d_none(*p4d) && !create) continue; if (WARN_ON_ONCE(p4d_leaf(*p4d))) return -EINVAL; if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { if (!create) continue; p4d_clear_bad(p4d); } err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create, mask); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { pgd_t *pgd; unsigned long start = addr, next; unsigned long end = addr + size; pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) return -EINVAL; pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; if (WARN_ON_ONCE(pgd_leaf(*pgd))) { err = -EINVAL; break; } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; pgd_clear_bad(pgd); } err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, start + size); return err; } /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. */ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. * * Unlike apply_to_page_range, this does _not_ fill in page tables * where they are absent. */ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, false); } /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures * or configurations (e.g. i386 with PAE) which might give a mix of unmatched * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif pte_unmap(vmf->pte); vmf->pte = NULL; return same; } /* * Return: * 0: copied succeeded * -EHWPOISON: copy failed due to hwpoison in source page * -EAGAIN: copied failed (some other reason) */ static inline int __wp_page_copy_user(struct page *dst, struct page *src, struct vm_fault *vmf) { int ret; void *kaddr; void __user *uaddr; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; if (likely(src)) { if (copy_mc_user_highpage(dst, src, addr, vma)) return -EHWPOISON; return 0; } /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ kaddr = kmap_local_page(dst); pagefault_disable(); uaddr = (void __user *)(addr & PAGE_MASK); /* * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ vmf->pte = NULL; if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { if (vmf->pte) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } /* * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ warn: WARN_ON_ONCE(1); clear_page(kaddr); } } ret = 0; pte_unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); pagefault_enable(); kunmap_local(kaddr); flush_dcache_page(dst); return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) { struct file *vm_file = vma->vm_file; if (vm_file) return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* * Special mappings (e.g. VDSO) do not have any file so fake * a default GFP_KERNEL for them. */ return GFP_KERNEL; } /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. * * We do this without the lock held, so that it can sleep if it needs to. */ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) { vm_fault_t ret; unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; if (vmf->vma->vm_file && IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) return VM_FAULT_SIGBUS; ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { folio_lock(folio); if (!folio->mapping) { folio_unlock(folio); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); return ret; } /* * Handle dirtying of a page in shared file mapping on a write fault. * * The function expects the page to be locked and unlocks it. */ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; struct folio *folio = page_folio(vmf->page); bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; dirtied = folio_mark_dirty(folio); VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); /* * Take a local copy of the address_space - folio.mapping may be zeroed * by truncate after folio_unlock(). The address_space itself remains * pinned by vma->vm_file's reference. We rely on folio_unlock()'s * release semantics to prevent the compiler from undoing this copying. */ mapping = folio_raw_mapping(folio); folio_unlock(folio); if (!page_mkwrite) file_update_time(vma->vm_file); /* * Throttle page dirtying rate down to writeback speed. * * mapping may be NULL here because some device drivers do not * set page.mapping but still dirty their pages * * Drop the mmap_lock before waiting on IO, if we can. The file * is pinning the mapping, as per above. */ if ((dirtied || page_mkwrite) && mapping) { struct file *fpin; fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); if (fpin) { fput(fpin); return VM_FAULT_COMPLETED; } } return 0; } /* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, * or due to us being the last reference standing to the page. In either * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); if (folio) { VM_BUG_ON(folio_test_anon(folio) && !PageAnonExclusive(vmf->page)); /* * Clear the folio's cpupid information as the existing * information potentially belongs to a now completely * unrelated process. */ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } /* * We could add a bitflag somewhere, but for now, we know that all * vm_ops that have a ->map_pages have been audited and don't need * the mmap_lock to be held. */ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) return 0; vma_end_read(vma); return VM_FAULT_RETRY; } /** * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a * fault handler, call this function rather than anon_vma_prepare(). * If this vma does not already have an associated anon_vma and we are * only protected by the per-VMA lock, the caller must retry with the * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to * determine if this VMA can share its anon_vma, and that's not safe to * do with only the per-VMA lock held for this VMA. * * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. */ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; if (vmf->flags & FAULT_FLAG_VMA_LOCK) mmap_read_unlock(vma->vm_mm); return ret; } /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * * Called with mmap_lock locked and the old page referenced, but * without the ptl held. * * High level logic flow: * * - Allocate a page, copy the content of the old page to the new one. * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. * - Take the PTL. If the pte changed, bail out and release the allocated page * - If the pte is still the way we remember it, update the page table and all * relevant references. This includes dropping the reference the page-table * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct folio *old_folio = NULL; struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; bool pfn_is_zero; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); ret = vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); if (!new_folio) goto oom; if (!pfn_is_zero) { int err; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. * The -EHWPOISON case will not be retried. */ folio_put(new_folio); if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) entry = pte_mksoft_dirty(entry); if (pte_uffd_wp(vmf->orig_pte)) entry = pte_mkuffd_wp(entry); } else { entry = maybe_mkwrite(pte_mkdirty(entry), vma); } /* * Clear the pte entry and flush it first, before updating the * pte with the new entry, to keep TLBs on different CPUs in * sync. This code used to set the new PTE then flush TLBs, but * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * folio_remove_rmap_pte() with the ptp_clear_flush * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. */ new_folio = old_folio; page_copied = 1; pte_unmap_unlock(vmf->pte, vmf->ptl); } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(old_folio); folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom: ret = VM_FAULT_OOM; out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return ret; } /** * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE * writeable once the page is prepared * * @vmf: structure describing the fault * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). * * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } wp_page_reuse(vmf, folio); return 0; } /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); ret = vmf_can_call_fault(vmf); if (ret) return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf, NULL); } wp_page_reuse(vmf, NULL); return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; folio_get(folio); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = vmf_can_call_fault(vmf); if (tmp) { folio_put(folio); return tmp; } tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); folio_put(folio); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static bool __wp_can_reuse_large_anon_folio(struct folio *folio, struct vm_area_struct *vma) { bool exclusive = false; /* Let's just free up a large folio if only a single page is mapped. */ if (folio_large_mapcount(folio) <= 1) return false; /* * The assumption for anonymous folios is that each page can only get * mapped once into each MM. The only exception are KSM folios, which * are always small. * * Each taken mapcount must be paired with exactly one taken reference, * whereby the refcount must be incremented before the mapcount when * mapping a page, and the refcount must be decremented after the * mapcount when unmapping a page. * * If all folio references are from mappings, and all mappings are in * the page tables of this MM, then this folio is exclusive to this MM. */ if (folio_test_large_maybe_mapped_shared(folio)) return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); if (unlikely(folio_test_swapcache(folio))) { /* * Note: freeing up the swapcache will fail if some PTEs are * still swap entries. */ if (!folio_trylock(folio)) return false; folio_free_swap(folio); folio_unlock(folio); } if (folio_large_mapcount(folio) != folio_ref_count(folio)) return false; /* Stabilize the mapcount vs. refcount and recheck. */ folio_lock_large_mapcount(folio); VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio)); if (folio_test_large_maybe_mapped_shared(folio)) goto unlock; if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); /* * Do we need the folio lock? Likely not. If there would have been * references from page migration/swapout, we would have detected * an additional folio reference and never ended up here. */ exclusive = true; unlock: folio_unlock_large_mapcount(folio); return exclusive; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, struct vm_area_struct *vma) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static bool wp_can_reuse_anon_folio(struct folio *folio, struct vm_area_struct *vma) { if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio)) return __wp_can_reuse_large_anon_folio(folio, vma); /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * * KSM doesn't necessarily raise the folio refcount. */ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) return false; if (!folio_test_lru(folio)) /* * We cannot easily detect+handle references from * remote LRU caches or references to LRU folios. */ lru_add_drain(); if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) return false; if (!folio_trylock(folio)) return false; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { folio_unlock(folio); return false; } /* * Ok, we've got the only folio reference from our mapping * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ folio_move_anon_rmap(folio, vma); folio_unlock(folio); return true; } /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) * * GUP wants to take a R/O pin on a possibly shared anonymous page * (FAULT_FLAG_UNSHARE) * * It is done by copying the page to a new address and decrementing the * shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've * done any necessary COW. * * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { if (!userfaultfd_wp_async(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } /* * Nothing needed (cache flush, TLB invalidations, * etc.) because we're only removing the uffd-wp bit, * which is completely invisible to the user. */ pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); /* * Update this to be prepared for following up CoW * handling */ vmf->orig_pte = pte; } /* * Userfaultfd write-protect can defer flushes. Ensure the TLB * is flushed in this case before copying. */ if (unlikely(userfaultfd_wp(vmf->vma) && mm_tlb_flush_pending(vmf->vma->vm_mm))) flush_tlb_page(vmf->vma, vmf->address); } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (vmf->page) folio = page_folio(vmf->page); /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. */ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if (!vmf->page || is_fsdax_page(vmf->page)) { vmf->page = NULL; return wp_pfn_shared(vmf); } return wp_page_shared(vmf, folio); } /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. * * If we encounter a page that is marked exclusive, we must reuse * the page without further checks. */ if (folio && folio_test_anon(folio) && (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { if (!PageAnonExclusive(vmf->page)) SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } wp_page_reuse(vmf, folio); return 0; } /* * Ok, we need to copy. Oh, well.. */ if (folio) folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, pgoff_t first_index, pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; zba = max(first_index, vba); zea = min(last_index, vea); unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details); } } /** * unmap_mapping_folio() - Unmap single folio from processes. * @folio: The locked folio to be unmapped. * * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once * truncation or invalidation holds the lock on a folio, it may find that * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ void unmap_mapping_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; VM_BUG_ON(!folio_test_locked(folio)); first_index = folio->index; last_index = folio_next_index(folio) - 1; details.even_cows = false; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } /** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. * @nr: Number of pages to be unmapped. 0 to unmap to end of file. * @even_cows: Whether to unmap even private COWed pages. * * Unmap the pages in this address space from any userspace process which * has them mmaped. Generally, you want to remove COWed pages as well when * a file is being truncated, but not when invalidating pages from the page * cache. */ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; details.even_cows = even_cows; if (last_index < first_index) last_index = ULONG_MAX; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); /** * unmap_mapping_range - unmap the portion of all mmaps in the specified * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT; pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); /* * Restore a potential device exclusive pte to a working pte entry */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; vm_fault_t ret; /* * We need a reference to lock the folio because we don't hold * the PTL so a racing thread can remove the device-exclusive * entry and unmap it. If the folio is free the entry must * have been removed already. If it happens to have already * been re-allocated after being freed all we do is lock and * unlock it. */ if (!folio_try_get(folio)) return 0; ret = folio_lock_or_retry(folio, vmf); if (ret) { folio_put(folio); return ret; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, folio, vmf->page, vmf->address, vmf->pte, vmf->orig_pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; } static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { if (!folio_test_swapcache(folio)) return false; if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (1 + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return 0; /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. * So is_pte_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } static vm_fault_t do_pte_missing(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); else return do_fault(vmf); } /* * This is actually a page-missing access, but with uffd-wp special pte * installed. It means this pte was wr-protected before being unmapped. */ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) { /* * Just in case there're leftover special ptes even after the region * got unregistered - we can simply clear them. */ if (unlikely(!userfaultfd_wp(vmf->vma))) return pte_marker_clear(vmf); return do_pte_missing(vmf); } static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); unsigned long marker = pte_marker_get(entry); /* * PTE markers should never be empty. If anything weird happened, * the best thing to do is to kill the process along with its mm. */ if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; /* Higher priority than uffd-wp when data corrupted */ if (marker & PTE_MARKER_POISONED) return VM_FAULT_HWPOISON; /* Hitting a guard page is always a fatal condition. */ if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ return VM_FAULT_SIGBUS; } static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; swp_entry_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; entry = pte_to_swp_entry(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); return NULL; } return folio; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); int i; /* * While allocating a large folio and doing swap_read_folio, which is * the case the being faulted pte doesn't have swapcache. We need to * ensure all PTEs have no cache as well, otherwise, we might go to * swap devices while the content is in swapcache. */ for (i = 0; i < max_nr; i++) { if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) return i; } return i; } /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. */ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; swp_entry_t entry; int idx; pte_t pte; addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); idx = (vmf->address - addr) / PAGE_SIZE; pte = ptep_get(ptep); if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; entry = pte_to_swp_entry(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. */ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) return false; if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) return false; return true; } static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, unsigned long addr, unsigned long orders) { int order, nr; order = highest_order(orders); /* * To swap in a THP with nr pages, we require that its first swap_offset * is aligned with that number, as it was when the THP was swapped out. * This helps filter out most invalid entries. */ while (orders) { nr = 1 << order; if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) break; order = next_order(&orders, order); } return orders; } static struct folio *alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long orders; struct folio *folio; unsigned long addr; swp_entry_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * A large swapped out folio could be partially or fully in zswap. We * lack handling for such cases, so fallback to swapping in order-0 * folio. */ if (!zswap_never_enabled()) goto fallback; entry = pte_to_swp_entry(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) goto fallback; /* * For do_swap_page, find the highest order where the aligned range is * completely swap entries with contiguous swap offsets. */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap_unlock(pte, ptl); /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } fallback: return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; int nr_pages; unsigned long page_idx; unsigned long address; pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_exclusive_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate * under VMA lock. */ vma_end_read(vma); ret = VM_FAULT_RETRY; goto out; } vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto unlock; /* * Get a page reference while we know the page can't be * freed. */ if (trylock_page(vmf->page)) { struct dev_pagemap *pgmap; get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); pgmap = page_pgmap(vmf->page); ret = pgmap->ops->migrate_to_ram(vmf); unlock_page(vmf->page); put_page(vmf->page); } else { pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } /* Prevent swapoff from happening to us. */ si = get_swap_device(entry); if (unlikely(!si)) goto out; folio = swap_cache_get_folio(entry, vma, vmf->address); if (folio) page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); nr_pages = folio_nr_pages(folio); if (folio_test_large(folio)) entry.val = ALIGN_DOWN(entry.val, nr_pages); /* * Prevent parallel swapin from proceeding with * the cache flag. Otherwise, another thread * may finish swapin first, free the entry, and * swapout reusing the same entry. It's * undetectable as pte_same() returns true due * to entry reuse. */ if (swapcache_prepare(entry, nr_pages)) { /* * Relax a bit to prevent rapid * repeated page faults. */ add_wait_queue(&swapcache_wq, &wait); schedule_timeout_uninterruptible(1); remove_wait_queue(&swapcache_wq, &wait); goto out_page; } need_clear_cache = true; memcg1_swapin(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(folio, shadow); folio_add_lru(folio); /* To provide entry to swap_read_folio() */ folio->swap = entry; swap_read_folio(folio, NULL); folio->private = NULL; } } else { folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = folio; } if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not * changed. */ if (unlikely(!folio_test_swapcache(folio) || page_swap_entry(page).val != entry.val)) goto out_page; /* * KSM sometimes has to copy on read faults, for example, if * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { ret = VM_FAULT_OOM; folio = swapcache; goto out_page; } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; folio = swapcache; goto out_page; } if (folio != swapcache) page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU * caches if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } folio_throttle_swaprate(folio, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } /* allocated large folios for SWP_SYNCHRONOUS_IO */ if (folio_test_large(folio) && !folio_test_swapcache(folio)) { unsigned long nr = folio_nr_pages(folio); unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; pte_t *folio_ptep = vmf->pte - idx; pte_t folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto out_nomap; page_idx = idx; address = folio_start; ptep = folio_ptep; goto check_folio; } nr_pages = 1; page_idx = 0; address = vmf->address; ptep = vmf->pte; if (folio_test_large(folio) && folio_test_swapcache(folio)) { int nr = folio_nr_pages(folio); unsigned long idx = folio_page_idx(folio, page); unsigned long folio_start = address - idx * PAGE_SIZE; unsigned long folio_end = folio_start + nr * PAGE_SIZE; pte_t *folio_ptep; pte_t folio_pte; if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) goto check_folio; if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) goto check_folio; folio_ptep = vmf->pte - idx; folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto check_folio; page_idx = idx; address = folio_start; ptep = folio_ptep; nr_pages = nr; entry = folio->swap; page = &folio->page; } check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is * PG_anon_exclusive. Sanity check that this holds and especially, that * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. */ exclusive = true; } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support * concurrent page modifications while under writeback. * * So if we stumble over such a page in the swapcache * we must not set the page exclusive, otherwise we can * map it writable without further checks and modify it * while still under writeback. * * For these problematic swap backends, simply drop the * exclusive marker: this is perfectly fine as we start * writeback only if we fully unmapped the page and * there are no unexpected references on the page after * unmapping succeeded. After fully unmapped, no * further GUP references (FOLL_GET and FOLL_PIN) can * appear, so dropping the exclusive marker and mapping * it only R/O is fine. */ exclusive = false; } } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. * We're already holding a reference on the page but haven't mapped it * yet. */ swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); /* * Same logic as in do_wp_page(); however, optimize for pages that are * certainly not shared either because we just allocated them without * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ if (!folio_test_ksm(folio) && (exclusive || folio_ref_count(folio) == 1)) { if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && !pte_needs_soft_dirty_wp(vma, pte)) { pte = pte_mkwrite(pte, vma); if (vmf->flags & FAULT_FLAG_WRITE) { pte = pte_mkdirty(pte); vmf->flags &= ~FAULT_FLAG_WRITE; } } rmap_flags |= RMAP_EXCLUSIVE; } folio_ref_add(folio, nr_pages - 1); flush_icache_pages(vma, page, nr_pages); vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else if (!folio_test_anon(folio)) { /* * We currently only expect small !anon folios which are either * fully exclusive or fully shared, or new allocated large * folios which are fully exclusive. If we ever get large * folios within swapcache here, we have to be careful. */ VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); folio_unlock(folio); if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For * further safety release the lock after the swap_free * so that the swap count won't change under a * parallel locked swapcache. */ folio_unlock(swapcache); folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; out_nomap: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: folio_unlock(folio); out_release: folio_put(folio); if (folio != swapcache && swapcache) { folio_unlock(swapcache); folio_put(swapcache); } if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; } static bool pte_range_none(pte_t *pte, int nr_pages) { int i; for (i = 0; i < nr_pages; i++) { if (!pte_none(ptep_get_lockless(pte + i))) return false; } return true; } static struct folio *alloc_anon_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); if (!pte) return ERR_PTR(-EAGAIN); /* * Find the highest order where the aligned range is completely * pte_none(). Note that all remaining orders will be completely * pte_none(). */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (pte_range_none(pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap(pte); if (!orders) goto fallback; /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation * (__GFP_ZERO not used) or user folios require special * handling, folio_zero_user() is used to make sure * that the page corresponding to the faulting address * will be hot in the cache after zeroing. */ if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } fallback: #endif return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(), so that OOM can * be distinguished from a transient failure of pte_offset_map(). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } /* Allocate our own private page. */ ret = vmf_anon_prepare(vmf); if (ret) return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) return 0; if (!folio) goto oom; nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; if (nr_pages == 1 && vmf_pte_changed(vmf)) { update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); goto unlock; oom: return VM_FAULT_OOM; } /* * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; /* * Preallocate pte before we take page_lock because this might lead to * deadlocks for memcg reclaim which waits for pages under writeback: * lock_page(A) * SetPageWriteback(A) * unlock_page(A) * lock_page(B) * lock_page(B) * pte_alloc_one * shrink_folio_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; folio = page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { if (page_mapped(vmf->page)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. */ if (mapping_evict_folio(folio->mapping, folio)) poisonret = VM_FAULT_NOPAGE; folio_unlock(folio); } folio_put(folio); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) folio_lock(folio); else VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; /* * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. */ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; if (folio_order(folio) != HPAGE_PMD_ORDER) return ret; page = &folio->page; /* * Just backoff if any subpage of a THP is corrupted otherwise * the corrupted page may mapped by PMD silently to escape the * check. This kind of THP just can be PTE mapped. Access to * the corrupted subpage should trigger SIGBUS as expected. */ if (unlikely(folio_test_has_hwpoisoned(folio))) return ret; /* * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) goto out; flush_icache_pages(vma, page, HPAGE_PMD_NR); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: spin_unlock(vmf->ptl); return ret; } #else vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { return VM_FAULT_FALLBACK; } #endif /** * set_pte_range - Set a range of PTEs to point to pages in a folio. * @vmf: Fault decription. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. * @addr: The first address to create a PTE for. */ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); entry = mk_pte(page, vma->vm_page_prot); if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); else entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); } static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); return !pte_none(ptep_get(vmf->pte)); } /** * finish_fault - finish page fault once we have prepared the page to fault * * @vmf: structure describing the fault * * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). * * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; struct folio *folio; vm_fault_t ret; bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; unsigned long addr; bool needs_fallback = false; fallback: addr = vmf->address; /* Did we COW the page? */ if (is_cow) page = vmf->cow_page; else page = vmf->page; /* * check even for read faults because we might have lost our CoWed * page */ if (!(vma->vm_flags & VM_SHARED)) { ret = check_stable_address_space(vma->vm_mm); if (ret) return ret; } if (pmd_none(*vmf->pmd)) { if (PageTransCompound(page)) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (vmf->prealloc_pte) pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; } folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; /* The index of the entry in the pagetable for fault page. */ pgoff_t pte_off = pte_index(vmf->address); /* * Fallback to per-page fault in case the folio size in page * cache beyond the VMA limits and PMD pagetable limits. */ if (unlikely(vma_off < idx || vma_off + (nr_pages - idx) > vma_pages(vma) || pte_off < idx || pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { nr_pages = 1; } else { /* Now we can set mappings for the whole large folio. */ addr = vmf->address - idx * PAGE_SIZE; page = &folio->page; } } vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { needs_fallback = true; pte_unmap_unlock(vmf->pte, vmf->ptl); goto fallback; } folio_ref_add(folio, nr_pages - 1); set_pte_range(vmf, folio, page, nr_pages, addr); type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); add_mm_counter(vma->vm_mm, type, nr_pages); ret = 0; unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_pages __read_mostly = 65536 >> PAGE_SHIFT; #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_pages << PAGE_SHIFT; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; /* * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ val = max(val, PAGE_SIZE); fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * * This function doesn't cross VMA or page table boundaries, in order to call * map_pages() and acquire a PTE lock only once. * * fault_around_pages defines how many pages we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_pages * PAGE_SIZE rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { pgoff_t nr_pages = READ_ONCE(fault_around_pages); pgoff_t pte_off = pte_index(vmf->address); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; pgoff_t from_pte, to_pte; vm_fault_t ret; /* The PTE offset of the start address, clamped to the VMA. */ from_pte = max(ALIGN_DOWN(pte_off, nr_pages), pte_off - min(pte_off, vma_off)); /* The PTE offset of the end address, clamped to the VMA and PTE. */ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE, pte_off + vma_pages(vmf->vma) - vma_off) - 1; if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } rcu_read_lock(); ret = vmf->vma->vm_ops->map_pages(vmf, vmf->pgoff + from_pte - pte_off, vmf->pgoff + to_pte - pte_off); rcu_read_unlock(); return ret; } /* Return true if we should do read fault-around, false otherwise */ static inline bool should_fault_around(struct vm_fault *vmf) { /* No ->map_pages? No way to fault around... */ if (!vmf->vma->vm_ops->map_pages) return false; if (uffd_disable_fault_around(vmf->vma)) return false; /* A single page implies no faulting 'around' at all. */ return fault_around_pages > 1; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). */ if (should_fault_around(vmf)) { ret = do_fault_around(vmf); if (ret) return ret; } ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); folio = page_folio(vmf->page); folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) folio_put(folio); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); if (!ret) ret = vmf_anon_prepare(vmf); if (ret) return ret; folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); if (!folio) return VM_FAULT_OOM; vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { ret = VM_FAULT_HWPOISON; goto unlock; } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: folio_put(folio); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; struct folio *folio; ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; folio = page_folio(vmf->page); /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } } ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { folio_unlock(folio); folio_put(folio); return ret; } ret |= fault_dirty_shared_page(vmf); return ret; } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update * of pte involves: take ptl, clearing the pte so that * we don't have concurrent modification by hardware * followed by an update. */ if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); else ret = do_shared_fault(vmf); /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; } int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses * the case where a mapping is writable but the process never writes * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ if (!writable) *flags |= TNF_NO_GROUP; /* * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) *flags |= TNF_SHARED; /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ if (folio_use_access_time(folio)) *last_cpupid = (-1 & LAST_CPUPID_MASK); else *last_cpupid = folio_last_cpupid(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); #ifdef CONFIG_NUMA_BALANCING count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); #endif if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } return mpol_misplaced(folio, vmf, addr); } static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long fault_addr, pte_t *fault_pte, bool writable) { pte_t pte, old_pte; old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) pte = pte_mkwrite(pte, vma); ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); } static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, struct folio *folio, pte_t fault_pte, bool ignore_writable, bool pte_write_upgrade) { int nr = pte_pfn(fault_pte) - folio_pfn(folio); unsigned long start, end, addr = vmf->address; unsigned long addr_start = addr - (nr << PAGE_SHIFT); unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); pte_t *start_ptep; /* Stay within the VMA and within the page table. */ start = max3(addr_start, pt_start, vma->vm_start); end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, vma->vm_end); start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(start_ptep); bool writable = false; if (!pte_present(ptent) || !pte_protnone(ptent)) continue; if (pfn_folio(pte_pfn(ptent)) != folio) continue; if (!ignore_writable) { ptent = pte_modify(ptent, vma->vm_page_prot); writable = pte_write(ptent); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, addr, ptent)) writable = true; } numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); } } static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; bool writable = false, ignore_writable = false; bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); /* Read the live PTE from the page tables: */ old_pte = ptep_get(vmf->pte); if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); /* * Detect now whether the PTE could be writable; this information * is only valid while holding the PT lock. */ writable = pte_write(pte); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; folio = vm_normal_folio(vma, vmf->address, pte); if (!folio || folio_is_zone_device(folio)) goto out_map; nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { flags |= TNF_MIGRATE_FAIL; goto out_map; } /* The folio is isolated and isolation code holds a folio reference. */ pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; /* Migrate to the requested node */ if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } flags |= TNF_MIGRATE_FAIL; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ if (folio && folio_test_large(folio)) numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, pte_write_upgrade); else numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; if (vma_is_anonymous(vma)) { if (likely(!unshare) && userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { if (userfaultfd_wp_async(vmf->vma)) goto split; return handle_userfault(vmf, VM_UFFD_WP); } return do_huge_pmd_wp_page(vmf); } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { pmd_t dummy_pmdval; /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. * * Use the maywrite version to indicate that vmf->pte may be * modified, but since we will use pte_same() to detect the * change of the !pte_none() entry, there is no need to recheck * the pmdval. Here we chooes to pass a dummy variable instead * of NULL, which helps new user think about why this place is * special. */ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &dummy_pmdval, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } } if (!vmf->pte) return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!pte_write(entry)) return do_wp_page(vmf); else if (likely(vmf->flags & FAULT_FLAG_WRITE)) entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); } else { /* Skip spurious TLB flush for retried page fault */ if (vmf->flags & FAULT_FLAG_TRIED) goto unlock; /* * This is needed only for protection faults but the arch code * is not yet telling us if this is a protection fault or not. * This still avoids useless tlb flushes for .text page faults * with threads. */ if (vmf->flags & FAULT_FLAG_WRITE) flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, vmf->pte); } unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* * On entry, we hold either the VMA lock or the mmap_lock * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in * the result, the mmap_lock is not held on exit. See filemap_fault() * and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return VM_FAULT_OOM; vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pud_t orig_pud = *vmf.pud; barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and * FAULT_FLAG_UNSHARE handling. */ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pud_set_accessed(&vmf, orig_pud); return 0; } } } vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; /* Huge pud page fault raced with pmd_alloc? */ if (pud_trans_unstable(vmf.pud)) goto retry_pud; if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !pmd_write(vmf.orig_pmd)) { ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pmd_set_accessed(&vmf); return 0; } } } return handle_pte_fault(&vmf); } /** * mm_account_fault - Do page fault accounting * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. * @address: the faulted address. * @flags: the fault flags. * @ret: the fault retcode. * * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. */ static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs, unsigned long address, unsigned int flags, vm_fault_t ret) { bool major; /* Incomplete faults will be accounted upon completion. */ if (ret & VM_FAULT_RETRY) return; /* * To preserve the behavior of older kernels, PGFAULT counters record * both successful and failed faults, as opposed to perf counters, * which ignore failed cases. */ count_vm_event(PGFAULT); count_memcg_event_mm(mm, PGFAULT); /* * Do not account for unsuccessful faults (e.g. when the address wasn't * valid). That includes arch_vma_access_permitted() failing before * reaching here. So this is not a "this many hardware page faults" * counter. We should use the hw profiling for that. */ if (ret & VM_FAULT_ERROR) return; /* * We define the fault as a major fault when the final successful fault * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't * handle it immediately previously). */ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); if (major) current->maj_flt++; else current->min_flt++; /* * If the fault is done for GUP, regs will be NULL. We only do the * accounting for the per thread fault counters who triggered the * fault, and we skip the perf event updates. */ if (!regs) return; if (major) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { /* the LRU algorithm only applies to accesses with recency */ current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) { current->in_lru_fault = false; } #else static void lru_gen_enter_fault(struct vm_area_struct *vma) { } static void lru_gen_exit_fault(void) { } #endif /* CONFIG_LRU_GEN */ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, unsigned int *flags) { if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) return VM_FAULT_SIGSEGV; /* * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's * just treat it like an ordinary read-fault otherwise. */ if (!is_cow_mapping(vma->vm_flags)) *flags &= ~FAULT_FLAG_UNSHARE; } else if (*flags & FAULT_FLAG_WRITE) { /* Write faults on read-only mappings are impossible ... */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) return VM_FAULT_SIGSEGV; /* ... and FOLL_FORCE only applies to COW mappings. */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } #ifdef CONFIG_PER_VMA_LOCK /* * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of * the assumption that lock is dropped on VM_FAULT_RETRY. */ if (WARN_ON_ONCE((*flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) return VM_FAULT_SIGSEGV; #endif return 0; } /* * By the time we get here, we already hold either the VMA lock or the * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; bool is_droppable; __set_current_state(TASK_RUNNING); ret = sanitize_fault_flags(vma, &flags); if (ret) goto out; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) { ret = VM_FAULT_SIGSEGV; goto out; } is_droppable = !!(vma->vm_flags & VM_DROPPABLE); /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); lru_gen_enter_fault(vma); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); /* * Warning: It is no longer safe to dereference vma-> after this point, * because mmap_lock might have been dropped by __handle_mm_fault(), so * vma might be destroyed from underneath us. */ lru_gen_exit_fault(); /* If the mapping is droppable, then errors due to OOM aren't fatal. */ if (is_droppable) ret &= ~VM_FAULT_OOM; if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no * VM_FAULT_OOM), there is no need to kill anything. * Just clean up the OOM state peacefully. */ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) mem_cgroup_oom_synchronize(false); } out: mm_account_fault(mm, regs, address, flags, ret); return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); #ifdef CONFIG_LOCK_MM_AND_FIND_VMA #include <linux/extable.h> static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { if (likely(mmap_read_trylock(mm))) return true; if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) { /* * We don't have this operation yet. * * It should be easy enough to do: it's basically a * atomic_long_try_cmpxchg_acquire() * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but * it also needs the proper lockdep magic etc. */ return false; } static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_write_lock_killable(mm); } /* * Helper for page fault handling. * * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * * For example, if we have a kernel bug that causes a page * fault, we don't want to just use mmap_read_lock() to get * the mm lock, because that would deadlock if the bug were * to happen while we're holding the mm lock for writing. * * So this checks the exception tables on kernel faults in * order to only do this all for instructions that are actually * expected to fault. * * We can also actually take the mm lock for writing if we * need to extend the vma, which helps the VM layer a lot. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; if (!get_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (likely(vma && (vma->vm_start <= addr))) return vma; /* * Well, dang. We might still be successful, but only * if we can extend a vma to do so. */ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { mmap_read_unlock(mm); return NULL; } /* * We can try to upgrade the mmap lock atomically, * in which case we can continue to use the vma * we already looked up. * * Otherwise we'll have to drop the mmap lock and * re-take it, and also look up the vma again, * re-checking it. */ if (!mmap_upgrade_trylock(mm)) { if (!upgrade_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (!vma) goto fail; if (vma->vm_start <= addr) goto success; if (!(vma->vm_flags & VM_GROWSDOWN)) goto fail; } if (expand_stack_locked(vma, addr)) goto fail; success: mmap_write_downgrade(mm); return vma; fail: mmap_write_unlock(mm); return NULL; } #endif #ifdef CONFIG_PER_VMA_LOCK static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) { unsigned int tgt_refcnt = VMA_LOCK_OFFSET; /* Additional refcnt if the vma is attached. */ if (!detaching) tgt_refcnt++; /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) return false; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, TASK_UNINTERRUPTIBLE); lock_acquired(&vma->vmlock_dep_map, _RET_IP_); return true; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) { bool locked; /* * __vma_enter_locked() returns false immediately if the vma is not * attached, otherwise it waits until refcnt is indicating that vma * is attached with no readers. */ locked = __vma_enter_locked(vma, false); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (locked) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } } EXPORT_SYMBOL_GPL(__vma_start_write); void vma_mark_detached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_attached(vma); /* * We are the only writer, so no need to use vma_refcount_put(). * The condition below is unlikely because the vma has been already * write-locked and readers can increment vm_refcnt only temporarily * before they check vm_lock_seq, realize the vma is locked and drop * back the vm_refcnt. That is a narrow window for observing a raised * vm_refcnt. */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true)) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } } } /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. */ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; rcu_read_lock(); retry: vma = mas_walk(&mas); if (!vma) goto inval; vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ goto retry; } /* Failed to lock the VMA */ goto inval; } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ /* Check if the vma we locked is the right one. */ if (unlikely(vma->vm_mm != mm || address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); return vma; inval_end_read: vma_end_read(vma); inval: rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } #endif /* CONFIG_PER_VMA_LOCK */ #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. * We've already handled the fast-path in-line. */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * We've already handled the fast-path in-line. */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, spinlock_t *lock, pte_t *ptep, pgprot_t pgprot, unsigned long pfn_base, unsigned long addr_mask, bool writable, bool special) { args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); args->addr_mask = addr_mask; args->pgprot = pgprot; args->writable = writable; args->special = special; } static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) { #ifdef CONFIG_LOCKDEP struct file *file = vma->vm_file; struct address_space *mapping = file ? file->f_mapping : NULL; if (mapping) lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || lockdep_is_held(&vma->vm_mm->mmap_lock)); else lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); #endif } /** * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address * @args: Pointer to struct @follow_pfnmap_args * * The caller needs to setup args->vma and args->address to point to the * virtual address as the target of such lookup. On a successful return, * the results will be put into other output fields. * * After the caller finished using the fields, the caller must invoke * another follow_pfnmap_end() to proper releases the locks and resources * of such look up request. * * During the start() and end() calls, the results in @args will be valid * as proper locks will be held. After the end() is called, all the fields * in @follow_pfnmap_args will be invalid to be further accessed. Further * use of such information after end() may require proper synchronizations * by the caller with page table updates, otherwise it can create a * security bug. * * If the PTE maps a refcounted page, callers are responsible to protect * against invalidation with MMU notifiers; otherwise access to the PFN at * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read, and the mmap semaphore cannot be released * before the end() is invoked. * * This function must not be used to modify PTE content. * * Return: zero on success, negative otherwise. */ int follow_pfnmap_start(struct follow_pfnmap_args *args) { struct vm_area_struct *vma = args->vma; unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; spinlock_t *lock; pgd_t *pgdp; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pfnmap_lockdep_assert(vma); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; retry: pgdp = pgd_offset(mm, address); if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); pud = READ_ONCE(*pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); if (!unlikely(pud_leaf(pud))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), pud_pfn(pud), PUD_MASK, pud_write(pud), pud_special(pud)); return 0; } pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); if (!unlikely(pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), pmd_special(pmd)); return 0; } ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; pte = ptep_get(ptep); if (!pte_present(pte)) goto unlock; pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), pte_pfn(pte), PAGE_MASK, pte_write(pte), pte_special(pte)); return 0; unlock: pte_unmap_unlock(ptep, lock); out: return -EINVAL; } EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** * follow_pfnmap_end(): End a follow_pfnmap_start() process * @args: Pointer to struct @follow_pfnmap_args * * Must be used in pair of follow_pfnmap_start(). See the start() function * above for more information. */ void follow_pfnmap_end(struct follow_pfnmap_args *args) { if (args->lock) spin_unlock(args->lock); if (args->ptep) pte_unmap(args->ptep); } EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading * * This is a generic implementation for &vm_operations_struct.access for an * iomem mapping. This callback is used by access_process_vm() when the @vma is * not page based. */ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; bool writable; struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: if (follow_pfnmap_start(&args)) return -EINVAL; prot = args.pgprot; phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; if (follow_pfnmap_start(&args)) goto out_unmap; if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); iounmap(maddr); goto retry; } if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); ret = len; follow_pfnmap_end(&args); out_unmap: iounmap(maddr); return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* * Access another process' address space as given in mm. */ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (mmap_read_lock_killable(mm)) return 0; /* Untag the address before looking up the VMA */ addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, offset; void *maddr; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { vma = expand_stack(mm, addr); /* mmap_lock was dropped on failure */ if (!vma) return buf - old_buf; /* Try again if stack expansion worked */ continue; } /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ bytes = 0; #ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) bytes = vma->vm_ops->access(vma, addr, buf, len, write); #endif if (bytes <= 0) break; } else { bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap_local_page(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } unmap_and_put_page(page, maddr); } len -= bytes; buf += bytes; addr += bytes; } mmap_read_unlock(mm); return buf - old_buf; } /** * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* * Access another process' address space. * Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(access_process_vm); #ifdef CONFIG_BPF_SYSCALL /* * Copy a string from another process's address space as given in mm. * If there is any error return -EFAULT. */ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int err = 0; *(char *)buf = '\0'; if (mmap_read_lock_killable(mm)) return -EFAULT; addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr)) { err = -EFAULT; goto out; } while (len) { int bytes, offset, retval; void *maddr; struct page *page; struct vm_area_struct *vma = NULL; page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* * Treat as a total failure for now until we decide how * to handle the CONFIG_HAVE_IOREMAP_PROT case and * stack expansion. */ *(char *)buf = '\0'; err = -EFAULT; goto out; } bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; maddr = kmap_local_page(page); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; unmap_and_put_page(page, maddr); break; } buf += bytes - 1; /* * Because strscpy always NUL terminates we need to * copy the last byte in the page if we are going to * load more pages */ if (bytes != len) { addr += bytes - 1; copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); buf += 1; addr += 1; } len -= bytes; unmap_and_put_page(page, maddr); } out: mmap_read_unlock(mm); if (err) return err; return buf - old_buf; } /** * copy_remote_vm_str - copy a string from another process's address space. * @tsk: the task of the target address space * @addr: start address to read from * @buf: destination buffer * @len: number of bytes to copy * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from @addr (source) to @buf (destination); * not including the trailing NUL. Always guaranteed to leave NUL-terminated * buffer. On any error, return -EFAULT. */ int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; if (unlikely(len == 0)) return 0; mm = get_task_mm(tsk); if (!mm) { *(char *)buf = '\0'; return -EFAULT; } ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(copy_remote_vm_str); #endif /* CONFIG_BPF_SYSCALL */ /* * Print the name of a VMA. */ void print_vma_addr(char *prefix, unsigned long ip) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; /* * we might be running from an atomic context so we cannot sleep */ if (!mmap_read_trylock(mm)) return; vma = vma_lookup(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; ip -= vma->vm_start; ip += vma->vm_pgoff << PAGE_SHIFT; printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip, vma->vm_start, vma->vm_end - vma->vm_start); } mmap_read_unlock(mm); } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { if (pagefault_disabled()) return; __might_sleep(file, line); if (current->mm) might_lock_read(¤t->mm->mmap_lock); } EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) /* * Process all subpages of the specified huge page with the specified * operation. The target subpage will be processed last to keep its * cache lines hot. */ static inline int process_huge_page( unsigned long addr_hint, unsigned int nr_pages, int (*process_subpage)(unsigned long addr, int idx, void *arg), void *arg) { int i, n, base, l, ret; unsigned long addr = addr_hint & ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= nr_pages) { /* If target subpage in first half of huge page */ base = 0; l = n; /* Process subpages at the end of huge page */ for (i = nr_pages - 1; i >= 2 * n; i--) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } else { /* If target subpage in second half of huge page */ base = nr_pages - 2 * (nr_pages - n); l = nr_pages - n; /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } /* * Process remaining subpages in left-right-left-right pattern * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); if (ret) return ret; cond_resched(); ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); if (ret) return ret; } return 0; } static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); for (i = 0; i < nr_pages; i++) { cond_resched(); clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } static int clear_subpage(unsigned long addr, int idx, void *arg) { struct folio *folio = arg; clear_user_highpage(folio_page(folio, idx), addr); return 0; } /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. * @addr_hint: The address will be accessed or the base address if uncelar. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { unsigned int nr_pages = folio_nr_pages(folio); if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) clear_gigantic_page(folio, addr_hint, nr_pages); else process_huge_page(addr_hint, nr_pages, clear_subpage, folio); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); src_page = folio_page(src, i); cond_resched(); if (copy_mc_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma)) return -EHWPOISON; } return 0; } struct copy_subpage_arg { struct folio *dst; struct folio *src; struct vm_area_struct *vma; }; static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; struct page *dst = folio_page(copy_arg->dst, idx); struct page *src = folio_page(copy_arg->src, idx); if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) return -EHWPOISON; return 0; } int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma) { unsigned int nr_pages = folio_nr_pages(dst); struct copy_subpage_arg arg = { .dst = dst, .src = src, .vma = vma, }; if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages); return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg); } long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault) { void *kaddr; unsigned long i, rc = 0; unsigned int nr_pages = folio_nr_pages(dst_folio); unsigned long ret_val = nr_pages * PAGE_SIZE; struct page *subpage; for (i = 0; i < nr_pages; i++) { subpage = folio_page(dst_folio, i); kaddr = kmap_local_page(subpage); if (!allow_pagefault) pagefault_disable(); rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE); if (!allow_pagefault) pagefault_enable(); kunmap_local(kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) break; flush_dcache_page(subpage); cond_resched(); } return ret_val; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; void __init ptlock_cache_init(void) { page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, SLAB_PANIC, NULL); } bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; ptdesc->ptl = ptl; return true; } void ptlock_free(struct ptdesc *ptdesc) { if (ptdesc->ptl) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif void vma_pgtable_walk_begin(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); } void vma_pgtable_walk_end(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2012 Novell Inc. * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2012-2019 Linux Foundation * * Core driver model functions and structures that should not be * shared outside of the drivers/base/ directory. * */ #include <linux/notifier.h> /** * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. * * @subsys - the struct kset that defines this subsystem * @devices_kset - the subsystem's 'devices' directory * @interfaces - list of subsystem interfaces associated * @mutex - protect the devices, and interfaces lists. * * @drivers_kset - the list of drivers associated * @klist_devices - the klist to iterate over the @devices_kset * @klist_drivers - the klist to iterate over the @drivers_kset * @bus_notifier - the bus notifier list for anything that cares about things * on this bus. * @bus - pointer back to the struct bus_type that this structure is associated * with. * @dev_root: Default device to use as the parent. * * @glue_dirs - "glue" directory to put in-between the parent device to * avoid namespace conflicts * @class - pointer back to the struct class that this structure is associated * with. * @lock_key: Lock class key for use by the lock validator * * This structure is the one that is the actual kobject allowing struct * bus_type/class to be statically allocated safely. Nothing outside of the * driver core should ever touch these fields. */ struct subsys_private { struct kset subsys; struct kset *devices_kset; struct list_head interfaces; struct mutex mutex; struct kset *drivers_kset; struct klist klist_devices; struct klist klist_drivers; struct blocking_notifier_head bus_notifier; unsigned int drivers_autoprobe:1; const struct bus_type *bus; struct device *dev_root; struct kset glue_dirs; const struct class *class; struct lock_class_key lock_key; }; #define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj) static inline struct subsys_private *subsys_get(struct subsys_private *sp) { if (sp) kset_get(&sp->subsys); return sp; } static inline void subsys_put(struct subsys_private *sp) { if (sp) kset_put(&sp->subsys); } struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { struct kobject kobj; struct klist klist_devices; struct klist_node knode_bus; struct module_kobject *mkobj; struct device_driver *driver; }; #define to_driver(obj) container_of(obj, struct driver_private, kobj) /** * struct device_private - structure to hold the private to the driver core portions of the device structure. * * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list * @knode_driver - node in driver list * @knode_bus - node in bus list * @knode_class - node in class list * @deferred_probe - entry in deferred_probe_list which is used to retry the * binding of drivers which were unable to get all the resources needed by * the device; typically because it depends on another driver getting * probed first. * @async_driver - pointer to device driver awaiting probe via async_probe * @device - pointer back to the struct device that this structure is * associated with. * @dead - This device is currently either in the process of or has been * removed from the system. Any asynchronous events scheduled for this * device should exit without taking any action. * * Nothing outside of the driver core should ever touch these fields. */ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; struct klist_node knode_bus; struct klist_node knode_class; struct list_head deferred_probe; const struct device_driver *async_driver; char *deferred_probe_reason; struct device *device; u8 dead:1; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) #define to_device_private_bus(obj) \ container_of(obj, struct device_private, knode_bus) #define to_device_private_class(obj) \ container_of(obj, struct device_private, knode_class) /* initialisation functions */ int devices_init(void); int buses_init(void); int classes_init(void); int firmware_init(void); #ifdef CONFIG_SYS_HYPERVISOR int hypervisor_init(void); #else static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS void auxiliary_bus_init(void); #else static inline void auxiliary_bus_init(void) { } #endif struct kobject *virtual_device_parent(void); int bus_add_device(struct device *dev); void bus_probe_device(struct device *dev); void bus_remove_device(struct device *dev); void bus_notify(struct device *dev, enum bus_notifier_event value); bool bus_is_registered(const struct bus_type *bus); int bus_add_driver(struct device_driver *drv); void bus_remove_driver(struct device_driver *drv); void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent); void driver_detach(const struct device_driver *drv); void driver_deferred_probe_del(struct device *dev); void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf); static inline int driver_match_device(const struct device_driver *drv, struct device *dev) { return drv->bus->match ? drv->bus->match(dev, drv) : 1; } static inline void dev_sync_state(struct device *dev) { if (dev->bus->sync_state) dev->bus->sync_state(dev); else if (dev->driver && dev->driver->sync_state) dev->driver->sync_state(dev); } int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups); void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { /* * Majority (all?) read accesses to dev->driver happens either * while holding device lock or in bus/driver code that is only * invoked when the device is bound to a driver and there is no * concern of the pointer being changed while it is being read. * However when reading device's uevent file we read driver pointer * without taking device lock (so we do not block there for * arbitrary amount of time). We use WRITE_ONCE() here to prevent * tearing so that READ_ONCE() can safely be used in uevent code. */ // FIXME - this cast should not be needed "soon" WRITE_ONCE(dev->driver, (struct device_driver *)drv); } int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); void deferred_probe_extend_timeout(void); void driver_deferred_probe_trigger(void); const char *device_get_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); /* /sys/devices directory */ extern struct kset *devices_kset; void devices_kset_move_last(struct device *dev); #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) int module_add_driver(struct module *mod, const struct device_driver *drv); void module_remove_driver(const struct device_driver *drv); #else static inline int module_add_driver(struct module *mod, struct device_driver *drv) { return 0; } static inline void module_remove_driver(struct device_driver *drv) { } #endif #ifdef CONFIG_DEVTMPFS int devtmpfs_init(void); #else static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; } #else static inline bool is_blockdev(struct device *dev) { return false; } #endif /* Device links support */ int device_links_read_lock(void); void device_links_read_unlock(int idx); int device_links_read_lock_held(void); int device_links_check_suppliers(struct device *dev); void device_links_force_bind(struct device *dev); void device_links_driver_bound(struct device *dev); void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); /* device pm support */ void device_pm_move_to_tail(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_create_node(struct device *dev); int devtmpfs_delete_node(struct device *dev); #else static inline int devtmpfs_create_node(struct device *dev) { return 0; } static inline int devtmpfs_delete_node(struct device *dev) { return 0; } #endif void software_node_notify(struct device *dev); void software_node_notify_remove(struct device *dev); |
31 30 31 31 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 | // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/mm/mmu.c * * Copyright (C) 1995-2005 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/cache.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/kexec.h> #include <linux/libfdt.h> #include <linux/mman.h> #include <linux/nodemask.h> #include <linux/memblock.h> #include <linux/memremap.h> #include <linux/memory.h> #include <linux/fs.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/kfence.h> #include <linux/pkeys.h> #include <asm/barrier.h> #include <asm/cputype.h> #include <asm/fixmap.h> #include <asm/kasan.h> #include <asm/kernel-pgtable.h> #include <asm/sections.h> #include <asm/setup.h> #include <linux/sizes.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #include <asm/ptdump.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/kfence.h> #define NO_BLOCK_MAPPINGS BIT(0) #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 }; static bool rodata_is_rw __ro_after_init = true; /* * The booting CPU updates the failed status @__early_cpu_boot_status, * with MMU turned off. */ long __section(".mmuoff.data.write") __early_cpu_boot_status; /* * Empty_zero_page is a special page that is used for zero-initialized data * and COW. */ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) { pgd_t *fixmap_pgdp; /* * Don't bother with the fixmap if swapper_pg_dir is still mapped * writable in the kernel mapping. */ if (rodata_is_rw) { WRITE_ONCE(*pgdp, pgd); dsb(ishst); isb(); return; } spin_lock(&swapper_pgdir_lock); fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp)); WRITE_ONCE(*fixmap_pgdp, pgd); /* * We need dsb(ishst) here to ensure the page-table-walker sees * our new entry before set_p?d() returns. The fixmap's * flush_tlb_kernel_range() via clear_fixmap() does this for us. */ pgd_clear_fixmap(); spin_unlock(&swapper_pgdir_lock); } pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { if (!pfn_is_map_memory(pfn)) return pgprot_noncached(vma_prot); else if (file->f_flags & O_SYNC) return pgprot_writecombine(vma_prot); return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); static phys_addr_t __init early_pgtable_alloc(int shift) { phys_addr_t phys; phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, MEMBLOCK_ALLOC_NOLEAKTRACE); if (!phys) panic("Failed to allocate page table page\n"); return phys; } bool pgattr_change_is_safe(pteval_t old, pteval_t new) { /* * The following mapping attributes may be updated in live * kernel mappings without the need for break-before-make. */ pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG | PTE_SWBITS_MASK; /* creating or taking down mappings is always safe */ if (!pte_valid(__pte(old)) || !pte_valid(__pte(new))) return true; /* A live entry's pfn should not change */ if (pte_pfn(__pte(old)) != pte_pfn(__pte(new))) return false; /* live contiguous mappings may not be manipulated at all */ if ((old | new) & PTE_CONT) return false; /* Transitioning from Non-Global to Global is unsafe */ if (old & ~new & PTE_NG) return false; /* * Changing the memory type between Normal and Normal-Tagged is safe * since Tagged is considered a permission attribute from the * mismatched attribute aliases perspective. */ if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) && ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED))) mask |= PTE_ATTRINDX_MASK; return ((old ^ new) & ~mask) == 0; } static void init_clear_pgtable(void *table) { clear_page(table); /* Ensure the zeroing is observed by page table walks. */ dsb(ishst); } static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot) { do { pte_t old_pte = __ptep_get(ptep); /* * Required barriers to make this visible to the table walker * are deferred to the end of alloc_init_cont_pte(). */ __set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); } static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); pte_t *ptep; BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; if (flags & NO_EXEC_MAPPINGS) pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(PAGE_SHIFT); ptep = pte_set_fixmap(pte_phys); init_clear_pgtable(ptep); ptep += pte_index(addr); __pmd_populate(pmdp, pte_phys, pmdval); } else { BUG_ON(pmd_bad(pmd)); ptep = pte_set_fixmap_offset(pmdp, addr); } do { pgprot_t __prot = prot; next = pte_cont_addr_end(addr, end); /* use a contiguous mapping if the range is suitably aligned */ if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) && (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); init_pte(ptep, addr, next, phys, __prot); ptep += pte_index(next) - pte_index(addr); phys += next - addr; } while (addr = next, addr != end); /* * Note: barriers and maintenance necessary to clear the fixmap slot * ensure that all previous pgtable writes are visible to the table * walker. */ pte_clear_fixmap(); } static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; do { pmd_t old_pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); /* try section mapping first */ if (((addr | next | phys) & ~PMD_MASK) == 0 && (flags & NO_BLOCK_MAPPINGS) == 0) { pmd_set_huge(pmdp, phys, prot); /* * After the PMD entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), READ_ONCE(pmd_val(*pmdp)))); } else { alloc_init_cont_pte(pmdp, addr, next, phys, prot, pgtable_alloc, flags); BUG_ON(pmd_val(old_pmd) != 0 && pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); } phys += next - addr; } while (pmdp++, addr = next, addr != end); } static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pud_t pud = READ_ONCE(*pudp); pmd_t *pmdp; /* * Check for initial section mappings in the pgd/pud. */ BUG_ON(pud_sect(pud)); if (pud_none(pud)) { pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; if (flags & NO_EXEC_MAPPINGS) pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(PMD_SHIFT); pmdp = pmd_set_fixmap(pmd_phys); init_clear_pgtable(pmdp); pmdp += pmd_index(addr); __pud_populate(pudp, pmd_phys, pudval); } else { BUG_ON(pud_bad(pud)); pmdp = pmd_set_fixmap_offset(pudp, addr); } do { pgprot_t __prot = prot; next = pmd_cont_addr_end(addr, end); /* use a contiguous mapping if the range is suitably aligned */ if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) && (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); pmd_clear_fixmap(); } static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; p4d_t p4d = READ_ONCE(*p4dp); pud_t *pudp; if (p4d_none(p4d)) { p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF; phys_addr_t pud_phys; if (flags & NO_EXEC_MAPPINGS) p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(PUD_SHIFT); pudp = pud_set_fixmap(pud_phys); init_clear_pgtable(pudp); pudp += pud_index(addr); __p4d_populate(p4dp, pud_phys, p4dval); } else { BUG_ON(p4d_bad(p4d)); pudp = pud_set_fixmap_offset(p4dp, addr); } do { pud_t old_pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); /* * For 4K granule only, attempt to put down a 1GB block */ if (pud_sect_supported() && ((addr | next | phys) & ~PUD_MASK) == 0 && (flags & NO_BLOCK_MAPPINGS) == 0) { pud_set_huge(pudp, phys, prot); /* * After the PUD entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), READ_ONCE(pud_val(*pudp)))); } else { alloc_init_cont_pmd(pudp, addr, next, phys, prot, pgtable_alloc, flags); BUG_ON(pud_val(old_pud) != 0 && pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); } phys += next - addr; } while (pudp++, addr = next, addr != end); pud_clear_fixmap(); } static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pgd_t pgd = READ_ONCE(*pgdp); p4d_t *p4dp; if (pgd_none(pgd)) { pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF; phys_addr_t p4d_phys; if (flags & NO_EXEC_MAPPINGS) pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); p4d_phys = pgtable_alloc(P4D_SHIFT); p4dp = p4d_set_fixmap(p4d_phys); init_clear_pgtable(p4dp); p4dp += p4d_index(addr); __pgd_populate(pgdp, p4d_phys, pgdval); } else { BUG_ON(pgd_bad(pgd)); p4dp = p4d_set_fixmap_offset(pgdp, addr); } do { p4d_t old_p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); alloc_init_pud(p4dp, addr, next, phys, prot, pgtable_alloc, flags); BUG_ON(p4d_val(old_p4d) != 0 && p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp))); phys += next - addr; } while (p4dp++, addr = next, addr != end); p4d_clear_fixmap(); } static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long addr, end, next; pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); /* * If the virtual and physical address don't have the same offset * within a page, we cannot map the region as the caller expects. */ if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) return; phys &= PAGE_MASK; addr = virt & PAGE_MASK; end = PAGE_ALIGN(virt + size); do { next = pgd_addr_end(addr, end); alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, flags); phys += next - addr; } while (pgdp++, addr = next, addr != end); } static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { mutex_lock(&fixmap_lock); __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, pgtable_alloc, flags); mutex_unlock(&fixmap_lock); } #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 extern __alias(__create_pgd_mapping_locked) void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags); #endif static phys_addr_t __pgd_pgtable_alloc(int shift) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); BUG_ON(!ptr); return __pa(ptr); } static phys_addr_t pgd_pgtable_alloc(int shift) { phys_addr_t pa = __pgd_pgtable_alloc(shift); struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); /* * Call proper page table ctor in case later we need to * call core mm functions like apply_to_page_range() on * this pre-allocated page table. * * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is * folded, and if so pagetable_pte_ctor() becomes nop. */ if (shift == PAGE_SHIFT) BUG_ON(!pagetable_pte_ctor(ptdesc)); else if (shift == PMD_SHIFT) BUG_ON(!pagetable_pmd_ctor(ptdesc)); return pa; } /* * This function can only be used to modify existing table entries, * without allocating new levels of table. Note that this permits the * creation of new section or page entries. */ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { if (virt < PAGE_OFFSET) { pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", &phys, virt); return; } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, NO_CONT_MAPPINGS); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, bool page_mappings_only) { int flags = 0; BUG_ON(mm == &init_mm); if (page_mappings_only) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(mm->pgd, phys, virt, size, prot, pgd_pgtable_alloc, flags); } static void update_mapping_prot(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { if (virt < PAGE_OFFSET) { pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n", &phys, virt); return; } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, NO_CONT_MAPPINGS); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); } static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, phys_addr_t end, pgprot_t prot, int flags) { __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, prot, early_pgtable_alloc, flags); } void __init mark_linear_text_alias_ro(void) { /* * Remove the write permissions from the linear alias of .text/.rodata */ update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_RO); } #ifdef CONFIG_KFENCE bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; /* early_param() will be parsed before map_mem() below. */ static int __init parse_kfence_early_init(char *arg) { int val; if (get_option(&arg, &val)) kfence_early_init = !!val; return 0; } early_param("kfence.sample_interval", parse_kfence_early_init); static phys_addr_t __init arm64_kfence_alloc_pool(void) { phys_addr_t kfence_pool; if (!kfence_early_init) return 0; kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!kfence_pool) { pr_err("failed to allocate kfence pool\n"); kfence_early_init = false; return 0; } /* Temporarily mark as NOMAP. */ memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); return kfence_pool; } static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { if (!kfence_pool) return; /* KFENCE pool needs page-level mapping. */ __map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE, pgprot_tagged(PAGE_KERNEL), NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = phys_to_virt(kfence_pool); } #else /* CONFIG_KFENCE */ static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { } #endif /* CONFIG_KFENCE */ static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); phys_addr_t kernel_start = __pa_symbol(_stext); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; phys_addr_t early_kfence_pool; int flags = NO_EXEC_MAPPINGS; u64 i; /* * Setting hierarchical PXNTable attributes on table entries covering * the linear region is only possible if it is guaranteed that no table * entries at any level are being shared between the linear region and * the vmalloc region. Check whether this is true for the PGD level, in * which case it is guaranteed to be true for all other levels as well. * (Unless we are running with support for LPA2, in which case the * entire reduced VA space is covered by a single pgd_t which will have * been populated without the PXNTable attribute by the time we get here.) */ BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) && pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1); early_kfence_pool = arm64_kfence_alloc_pool(); if (can_set_direct_map()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* * Take care not to create a writable alias for the * read-only text and rodata sections of the kernel image. * So temporarily mark them as NOMAP to skip mappings in * the following for-loop */ memblock_mark_nomap(kernel_start, kernel_end - kernel_start); /* map all the memory banks */ for_each_mem_range(i, &start, &end) { if (start >= end) break; /* * The linear map must allow allocation tags reading/writing * if MTE is present. Otherwise, it has the same attributes as * PAGE_KERNEL. */ __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), flags); } /* * Map the linear alias of the [_stext, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents * of the region accessible to subsystems such as hibernate, * but protects it from inadvertent modification or execution. * Note that contiguous mappings cannot be remapped in this way, * so we should avoid them here. */ __map_memblock(pgdp, kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); arm64_kfence_map_pool(early_kfence_pool, pgdp); } void mark_rodata_ro(void) { unsigned long section_size; /* * mark .rodata as read only. Use __init_begin rather than __end_rodata * to cover NOTES and EXCEPTION_TABLE. */ section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); } static void __init declare_vma(struct vm_struct *vma, void *va_start, void *va_end, unsigned long vm_flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; BUG_ON(!PAGE_ALIGNED(pa_start)); BUG_ON(!PAGE_ALIGNED(size)); if (!(vm_flags & VM_NO_GUARD)) size += PAGE_SIZE; vma->addr = va_start; vma->phys_addr = pa_start; vma->size = size; vma->flags = VM_MAP | vm_flags; vma->caller = __builtin_return_address(0); vm_area_add_early(vma); } #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static pgprot_t kernel_exec_prot(void) { return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; } static int __init map_entry_trampoline(void) { int i; if (!arm64_kernel_unmapped_at_el0()) return 0; pgprot_t prot = kernel_exec_prot(); phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); /* The trampoline is always mapped and can therefore be global */ pgprot_val(prot) &= ~PTE_NG; /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, entry_tramp_text_size(), prot, __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, pa_start + i * PAGE_SIZE, prot); if (IS_ENABLED(CONFIG_RELOCATABLE)) __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO); return 0; } core_initcall(map_entry_trampoline); #endif /* * Declare the VMA areas for the kernel */ static void __init declare_kernel_vmas(void) { static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[4], _data, _end, 0); } void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { u64 start = __pa_symbol(__idmap_text_start); u64 end = __pa_symbol(__idmap_text_end); u64 ptep = __pa_symbol(idmap_ptes); __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { extern u32 __idmap_kpti_flag; u64 pa = __pa_symbol(&__idmap_kpti_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping * of its synchronization flag in the ID map. */ ptep = __pa_symbol(kpti_ptes); __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); } } void __init paging_init(void) { map_mem(swapper_pg_dir); memblock_allow_resize(); create_idmap(); declare_kernel_vmas(); } #ifdef CONFIG_MEMORY_HOTPLUG static void free_hotplug_page_range(struct page *page, size_t size, struct vmem_altmap *altmap) { if (altmap) { vmem_altmap_free(altmap, size >> PAGE_SHIFT); } else { WARN_ON(PageReserved(page)); free_pages((unsigned long)page_address(page), get_order(size)); } } static void free_hotplug_pgtable_page(struct page *page) { free_hotplug_page_range(page, PAGE_SIZE, NULL); } static bool pgtable_range_aligned(unsigned long start, unsigned long end, unsigned long floor, unsigned long ceiling, unsigned long mask) { start &= mask; if (start < floor) return false; if (ceiling) { ceiling &= mask; if (!ceiling) return false; } if (end - 1 > ceiling - 1) return false; return true; } static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { pte_t *ptep, pte; do { ptep = pte_offset_kernel(pmdp, addr); pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), PAGE_SIZE, altmap); } while (addr += PAGE_SIZE, addr < end); } static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { unsigned long next; pmd_t *pmdp, pmd; do { next = pmd_addr_end(addr, end); pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) continue; WARN_ON(!pmd_present(pmd)); if (pmd_sect(pmd)) { pmd_clear(pmdp); /* * One TLBI should be sufficient here as the PMD_SIZE * range is mapped with a single block entry. */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pmd_page(pmd), PMD_SIZE, altmap); continue; } WARN_ON(!pmd_table(pmd)); unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); } static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { unsigned long next; pud_t *pudp, pud; do { next = pud_addr_end(addr, end); pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) continue; WARN_ON(!pud_present(pud)); if (pud_sect(pud)) { pud_clear(pudp); /* * One TLBI should be sufficient here as the PUD_SIZE * range is mapped with a single block entry. */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pud_page(pud), PUD_SIZE, altmap); continue; } WARN_ON(!pud_table(pud)); unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); } static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { unsigned long next; p4d_t *p4dp, p4d; do { next = p4d_addr_end(addr, end); p4dp = p4d_offset(pgdp, addr); p4d = READ_ONCE(*p4dp); if (p4d_none(p4d)) continue; WARN_ON(!p4d_present(p4d)); unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); } static void unmap_hotplug_range(unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { unsigned long next; pgd_t *pgdp, pgd; /* * altmap can only be used as vmemmap mapping backing memory. * In case the backing memory itself is not being freed, then * altmap is irrelevant. Warn about this inconsistency when * encountered. */ WARN_ON(!free_mapped && altmap); do { next = pgd_addr_end(addr, end); pgdp = pgd_offset_k(addr); pgd = READ_ONCE(*pgdp); if (pgd_none(pgd)) continue; WARN_ON(!pgd_present(pgd)); unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); } static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pte_t *ptep, pte; unsigned long i, start = addr; do { ptep = pte_offset_kernel(pmdp, addr); pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that * pte clearing has been done by earlier unmap loops. */ WARN_ON(!pte_none(pte)); } while (addr += PAGE_SIZE, addr < end); if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK)) return; /* * Check whether we can free the pte page if the rest of the * entries are empty. Overlap with other regions have been * handled by the floor/ceiling check. */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { if (!pte_none(__ptep_get(&ptep[i]))) return; } pmd_clear(pmdp); __flush_tlb_kernel_pgtable(start); free_hotplug_pgtable_page(virt_to_page(ptep)); } static void free_empty_pmd_table(pud_t *pudp, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmdp, pmd; unsigned long i, next, start = addr; do { next = pmd_addr_end(addr, end); pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) continue; WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd)); free_empty_pte_table(pmdp, addr, next, floor, ceiling); } while (addr = next, addr < end); if (CONFIG_PGTABLE_LEVELS <= 2) return; if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK)) return; /* * Check whether we can free the pmd page if the rest of the * entries are empty. Overlap with other regions have been * handled by the floor/ceiling check. */ pmdp = pmd_offset(pudp, 0UL); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(READ_ONCE(pmdp[i]))) return; } pud_clear(pudp); __flush_tlb_kernel_pgtable(start); free_hotplug_pgtable_page(virt_to_page(pmdp)); } static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pudp, pud; unsigned long i, next, start = addr; do { next = pud_addr_end(addr, end); pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) continue; WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud)); free_empty_pmd_table(pudp, addr, next, floor, ceiling); } while (addr = next, addr < end); if (!pgtable_l4_enabled()) return; if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK)) return; /* * Check whether we can free the pud page if the rest of the * entries are empty. Overlap with other regions have been * handled by the floor/ceiling check. */ pudp = pud_offset(p4dp, 0UL); for (i = 0; i < PTRS_PER_PUD; i++) { if (!pud_none(READ_ONCE(pudp[i]))) return; } p4d_clear(p4dp); __flush_tlb_kernel_pgtable(start); free_hotplug_pgtable_page(virt_to_page(pudp)); } static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4dp, p4d; unsigned long i, next, start = addr; do { next = p4d_addr_end(addr, end); p4dp = p4d_offset(pgdp, addr); p4d = READ_ONCE(*p4dp); if (p4d_none(p4d)) continue; WARN_ON(!p4d_present(p4d)); free_empty_pud_table(p4dp, addr, next, floor, ceiling); } while (addr = next, addr < end); if (!pgtable_l5_enabled()) return; if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK)) return; /* * Check whether we can free the p4d page if the rest of the * entries are empty. Overlap with other regions have been * handled by the floor/ceiling check. */ p4dp = p4d_offset(pgdp, 0UL); for (i = 0; i < PTRS_PER_P4D; i++) { if (!p4d_none(READ_ONCE(p4dp[i]))) return; } pgd_clear(pgdp); __flush_tlb_kernel_pgtable(start); free_hotplug_pgtable_page(virt_to_page(p4dp)); } static void free_empty_tables(unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { unsigned long next; pgd_t *pgdp, pgd; do { next = pgd_addr_end(addr, end); pgdp = pgd_offset_k(addr); pgd = READ_ONCE(*pgdp); if (pgd_none(pgd)) continue; WARN_ON(!pgd_present(pgd)); free_empty_p4d_table(pgdp, addr, next, floor, ceiling); } while (addr = next, addr < end); } #endif void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, unsigned long addr, unsigned long next) { pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); } int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) { vmemmap_verify((pte_t *)pmdp, node, addr, next); return pmd_sect(READ_ONCE(*pmdp)); } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); /* [start, end] should be within one section */ WARN_ON_ONCE(end - start > PAGES_PER_SECTION * sizeof(struct page)); if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES) || (end - start < PAGES_PER_SECTION * sizeof(struct page))) return vmemmap_populate_basepages(start, end, node, altmap); else return vmemmap_populate_hugepages(start, end, node, altmap); } #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); unmap_hotplug_range(start, end, true, altmap); free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END); } #endif /* CONFIG_MEMORY_HOTPLUG */ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); /* Only allow permission changes for now */ if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)), pud_val(new_pud))) return 0; VM_BUG_ON(phys & ~PUD_MASK); set_pud(pudp, new_pud); return 1; } int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) { pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); /* Only allow permission changes for now */ if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)), pmd_val(new_pmd))) return 0; VM_BUG_ON(phys & ~PMD_MASK); set_pmd(pmdp, new_pmd); return 1; } #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_huge(p4d_t *p4dp) { } #endif int pud_clear_huge(pud_t *pudp) { if (!pud_sect(READ_ONCE(*pudp))) return 0; pud_clear(pudp); return 1; } int pmd_clear_huge(pmd_t *pmdp) { if (!pmd_sect(READ_ONCE(*pmdp))) return 0; pmd_clear(pmdp); return 1; } int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) { pte_t *table; pmd_t pmd; pmd = READ_ONCE(*pmdp); if (!pmd_table(pmd)) { VM_WARN_ON(1); return 1; } table = pte_offset_kernel(pmdp, addr); pmd_clear(pmdp); __flush_tlb_kernel_pgtable(addr); pte_free_kernel(NULL, table); return 1; } int pud_free_pmd_page(pud_t *pudp, unsigned long addr) { pmd_t *table; pmd_t *pmdp; pud_t pud; unsigned long next, end; pud = READ_ONCE(*pudp); if (!pud_table(pud)) { VM_WARN_ON(1); return 1; } table = pmd_offset(pudp, addr); pmdp = table; next = addr; end = addr + PUD_SIZE; do { pmd_free_pte_page(pmdp, next); } while (pmdp++, next += PMD_SIZE, next != end); pud_clear(pudp); __flush_tlb_kernel_pgtable(addr); pmd_free(NULL, table); return 1; } #ifdef CONFIG_MEMORY_HOTPLUG static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) { unsigned long end = start + size; WARN_ON(pgdir != init_mm.pgd); WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END)); unmap_hotplug_range(start, end, false, NULL); free_empty_tables(start, end, PAGE_OFFSET, PAGE_END); } struct range arch_get_mappable_range(void) { struct range mhp_range; u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); u64 end_linear_pa = __pa(PAGE_END - 1); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { /* * Check for a wrap, it is possible because of randomized linear * mapping the start physical address is actually bigger than * the end physical address. In this case set start to zero * because [0, end_linear_pa] range must still be able to cover * all addressable physical addresses. */ if (start_linear_pa > end_linear_pa) start_linear_pa = 0; } WARN_ON(start_linear_pa > end_linear_pa); /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] * accommodating both its ends but excluding PAGE_END. Max physical * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ mhp_range.start = start_linear_pa; mhp_range.end = end_linear_pa; return mhp_range; } int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) { int ret, flags = NO_EXEC_MAPPINGS; VM_BUG_ON(!mhp_range_allowed(start, size, true)); if (can_set_direct_map()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), size, params->pgprot, __pgd_pgtable_alloc, flags); memblock_clear_nomap(start, size); ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params); if (ret) __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); else { /* Address of hotplugged memory can be smaller */ max_pfn = max(max_pfn, PFN_UP(start + size)); max_low_pfn = max_pfn; } return ret; } void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; __remove_pages(start_pfn, nr_pages, altmap); __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); } /* * This memory hotplug notifier helps prevent boot memory from being * inadvertently removed as it blocks pfn range offlining process in * __offline_pages(). Hence this prevents both offlining as well as * removal process for boot memory which is initially always online. * In future if and when boot memory could be removed, this notifier * should be dropped and free_hotplug_page_range() should handle any * reserved pages allocated during boot. */ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct mem_section *ms; struct memory_notify *arg = data; unsigned long end_pfn = arg->start_pfn + arg->nr_pages; unsigned long pfn = arg->start_pfn; if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE)) return NOTIFY_OK; for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { unsigned long start = PFN_PHYS(pfn); unsigned long end = start + (1UL << PA_SECTION_SHIFT); ms = __pfn_to_section(pfn); if (!early_section(ms)) continue; if (action == MEM_GOING_OFFLINE) { /* * Boot memory removal is not supported. Prevent * it via blocking any attempted offline request * for the boot memory and just report it. */ pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end); return NOTIFY_BAD; } else if (action == MEM_OFFLINE) { /* * This should have never happened. Boot memory * offlining should have been prevented by this * very notifier. Probably some memory removal * procedure might have changed which would then * require further debug. */ pr_err("Boot memory [%lx %lx] offlined\n", start, end); /* * Core memory hotplug does not process a return * code from the notifier for MEM_OFFLINE events. * The error condition has been reported. Return * from here as if ignored. */ return NOTIFY_DONE; } } return NOTIFY_OK; } static struct notifier_block prevent_bootmem_remove_nb = { .notifier_call = prevent_bootmem_remove_notifier, }; /* * This ensures that boot memory sections on the platform are online * from early boot. Memory sections could not be prevented from being * offlined, unless for some reason they are not online to begin with. * This helps validate the basic assumption on which the above memory * event notifier works to prevent boot memory section offlining and * its possible removal. */ static void validate_bootmem_online(void) { phys_addr_t start, end, addr; struct mem_section *ms; u64 i; /* * Scanning across all memblock might be expensive * on some big memory systems. Hence enable this * validation only with DEBUG_VM. */ if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; for_each_mem_range(i, &start, &end) { for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) { ms = __pfn_to_section(PHYS_PFN(addr)); /* * All memory ranges in the system at this point * should have been marked as early sections. */ WARN_ON(!early_section(ms)); /* * Memory notifier mechanism here to prevent boot * memory offlining depends on the fact that each * early section memory on the system is initially * online. Otherwise a given memory section which * is already offline will be overlooked and can * be removed completely. Call out such sections. */ if (!online_section(ms)) pr_err("Boot memory [%llx %llx] is offline, can be removed\n", addr, addr + (1UL << PA_SECTION_SHIFT)); } } } static int __init prevent_bootmem_remove_init(void) { int ret = 0; if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) return ret; validate_bootmem_online(); ret = register_memory_notifier(&prevent_bootmem_remove_nb); if (ret) pr_err("%s: Notifier registration failed %d\n", __func__, ret); return ret; } early_initcall(prevent_bootmem_remove_init); #endif pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { set_pte_at(vma->vm_mm, addr, ptep, pte); } /* * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, * avoiding the possibility of conflicting TLB entries being allocated. */ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) { typedef void (ttbr_replace_func)(phys_addr_t); extern ttbr_replace_func idmap_cpu_replace_ttbr1; ttbr_replace_func *replace_phys; unsigned long daif; /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); if (cnp) ttbr1 |= TTBR_CNP_BIT; replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); cpu_install_idmap(); /* * We really don't want to take *any* exceptions while TTBR1 is * in the process of being replaced so mask everything. */ daif = local_daif_save(); replace_phys(ttbr1); local_daif_restore(daif); cpu_uninstall_idmap(); } #ifdef CONFIG_ARCH_HAS_PKEYS int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { u64 new_por; u64 old_por; if (!system_supports_poe()) return -ENOSPC; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in POR: */ new_por = POE_RWX; if (init_val & PKEY_DISABLE_WRITE) new_por &= ~POE_W; if (init_val & PKEY_DISABLE_ACCESS) new_por &= ~POE_RW; if (init_val & PKEY_DISABLE_READ) new_por &= ~POE_R; if (init_val & PKEY_DISABLE_EXECUTE) new_por &= ~POE_X; /* Shift the bits in to the correct place in POR for pkey: */ new_por = POR_ELx_PERM_PREP(pkey, new_por); /* Get old POR and mask off any old bits in place: */ old_por = read_sysreg_s(SYS_POR_EL0); old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey)); /* Write old part along with new part: */ write_sysreg_s(old_por | new_por, SYS_POR_EL0); return 0; } #endif |
681 251 585 1259 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for atomic bit * operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #include <linux/instrumented.h> /** * set_bit - Atomically set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void set_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_set_bit(nr, addr); } /** * clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). */ static __always_inline void clear_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_clear_bit(nr, addr); } /** * change_bit - Toggle a bit in memory * @nr: Bit to change * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void change_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_change_bit(nr, addr); } /** * test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit(nr, addr); } /** * test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_clear_bit(nr, addr); } /** * test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_change_bit(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include "ipvlan.h" #include <linux/if_vlan.h> #include <linux/if_tap.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static dev_t ipvtap_major; static struct cdev ipvtap_cdev; static const void *ipvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class ipvtap_class = { .name = "ipvtap", .ns_type = &net_ns_type_operations, .namespace = ipvtap_net_namespace, }; struct ipvtap_dev { struct ipvl_dev vlan; struct tap_dev tap; }; static void ipvtap_count_tx_dropped(struct tap_dev *tap) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_drps); } static void ipvtap_count_rx_dropped(struct tap_dev *tap) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; ipvlan_count_rx(vlan, 0, 0, 0); } static void ipvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; vlan->sfeatures = features; netdev_update_features(vlan->dev); } static int ipvtap_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ipvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; vlantap->tap.count_tx_dropped = ipvtap_count_tx_dropped; vlantap->tap.update_features = ipvtap_update_features; vlantap->tap.count_rx_dropped = ipvtap_count_rx_dropped; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = ipvlan_link_new(dev, params, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return err; } static void ipvtap_dellink(struct net_device *dev, struct list_head *head) { struct ipvtap_dev *vlan = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlan->tap); ipvlan_link_delete(dev, head); } static void ipvtap_setup(struct net_device *dev) { ipvlan_link_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; dev->priv_flags &= ~IFF_NO_QUEUE; } static struct rtnl_link_ops ipvtap_link_ops __read_mostly = { .kind = "ipvtap", .setup = ipvtap_setup, .newlink = ipvtap_newlink, .dellink = ipvtap_dellink, .priv_size = sizeof(struct ipvtap_dev), }; static int ipvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &ipvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(ipvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); classdev = device_create(&ipvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(ipvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); device_destroy(&ipvtap_class, devt); tap_free_minor(ipvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block ipvtap_notifier_block __read_mostly = { .notifier_call = ipvtap_device_event, }; static int __init ipvtap_init(void) { int err; err = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap", THIS_MODULE); if (err) goto out1; err = class_register(&ipvtap_class); if (err) goto out2; err = register_netdevice_notifier(&ipvtap_notifier_block); if (err) goto out3; err = ipvlan_link_register(&ipvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&ipvtap_notifier_block); out3: class_unregister(&ipvtap_class); out2: tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); out1: return err; } module_init(ipvtap_init); static void __exit ipvtap_exit(void) { rtnl_link_unregister(&ipvtap_link_ops); unregister_netdevice_notifier(&ipvtap_notifier_block); class_unregister(&ipvtap_class); tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); } module_exit(ipvtap_exit); MODULE_ALIAS_RTNL_LINK("ipvtap"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_DESCRIPTION("IP-VLAN based tap driver"); MODULE_LICENSE("GPL"); |
289 265 265 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 | /* SPDX-License-Identifier: GPL-2.0 */ /* * net/dst.h Protocol independent destination cache definitions. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #ifndef _NET_DST_H #define _NET_DST_H #include <net/dst_ops.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/bug.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <linux/rcuref.h> #include <net/neighbour.h> #include <asm/processor.h> #include <linux/indirect_call_wrapper.h> struct sk_buff; struct dst_entry { struct net_device *dev; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 #define DST_NOCOUNT 0x0008 #define DST_FAKE_RTABLE 0x0010 #define DST_XFRM_TUNNEL 0x0020 #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully * destroyed. * * Negative values are used by the implementation layer code to * force invocation of the dst_ops->check() method. */ short obsolete; #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ /* * __rcuref wants to be on a different cache line from * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT rcuref_t __rcuref; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT struct lwtunnel_state *lwtstate; rcuref_t __rcuref; /* 32-bit offset 64 */ #endif netdevice_tracker dev_tracker; /* * Used by rtable and rt6_info. Moves lwtstate into the next cache * line on 64bit so that lwtstate does not cause false sharing with * __rcuref under contention of __rcuref. This also puts the * frequently accessed members of rtable and rt6_info out of the * __rcuref cache line. */ struct list_head rt_uncached; struct uncached_list *rt_uncached_list; #ifdef CONFIG_64BIT struct lwtunnel_state *lwtstate; #endif }; struct dst_metrics { u32 metrics[RTAX_MAX]; refcount_t refcnt; } __aligned(4); /* Low pointer bits contain DST_METRICS_FLAGS */ extern const struct dst_metrics dst_default_metrics; u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); #define DST_METRICS_READ_ONLY 0x1UL #define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) static inline bool dst_metrics_read_only(const struct dst_entry *dst) { return dst->_metrics & DST_METRICS_READ_ONLY; } void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); static inline void dst_destroy_metrics_generic(struct dst_entry *dst) { unsigned long val = dst->_metrics; if (!(val & DST_METRICS_READ_ONLY)) __dst_destroy_metrics_generic(dst, val); } static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) { unsigned long p = dst->_metrics; BUG_ON(!p); if (p & DST_METRICS_READ_ONLY) return dst->ops->cow_metrics(dst, p); return __DST_METRICS_PTR(p); } /* This may only be invoked before the entry has reached global * visibility. */ static inline void dst_init_metrics(struct dst_entry *dst, const u32 *src_metrics, bool read_only) { dst->_metrics = ((unsigned long) src_metrics) | (read_only ? DST_METRICS_READ_ONLY : 0); } static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) { u32 *dst_metrics = dst_metrics_write_ptr(dest); if (dst_metrics) { u32 *src_metrics = DST_METRICS_PTR(src); memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); } } static inline u32 *dst_metrics_ptr(struct dst_entry *dst) { return DST_METRICS_PTR(dst); } static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric) { u32 *p = DST_METRICS_PTR(dst); return p[metric-1]; } static inline u32 dst_metric(const struct dst_entry *dst, const int metric) { WARN_ON_ONCE(metric == RTAX_HOPLIMIT || metric == RTAX_ADVMSS || metric == RTAX_MTU); return dst_metric_raw(dst, metric); } static inline u32 dst_metric_advmss(const struct dst_entry *dst) { u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); if (!advmss) advmss = dst->ops->default_advmss(dst); return advmss; } static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { u32 *p = dst_metrics_write_ptr(dst); if (p) p[metric-1] = val; } /* Kernel-internal feature bits that are unallocated in user space. */ #define DST_FEATURE_ECN_CA (1U << 31) #define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) #define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { return dst_metric(dst, RTAX_FEATURES) & feature; } INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) { return msecs_to_jiffies(dst_metric(dst, metric)); } static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { return dst_metric(dst, RTAX_LOCK) & (1 << metric); } static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check * the placement of __rcuref in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); WARN_ON(!rcuref_get(&dst->__rcuref)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { if (unlikely(time != dst->lastuse)) { dst->__use++; dst->lastuse = time; } } static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) dst_hold(dst); return dst; } void dst_release(struct dst_entry *dst); void dst_release_immediate(struct dst_entry *dst); static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); } /** * skb_dst_drop - drops skb dst * @skb: buffer * * Drops dst reference count if a reference was taken. */ static inline void skb_dst_drop(struct sk_buff *skb) { if (skb->_skb_refdst) { refdst_drop(skb->_skb_refdst); skb->_skb_refdst = 0UL; } } static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { nskb->slow_gro |= !!refdst; nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); } static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) { __skb_dst_copy(nskb, oskb->_skb_refdst); } /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry * * This helper returns false if it could not safely * take a reference on a dst. */ static inline bool dst_hold_safe(struct dst_entry *dst) { return rcuref_get(&dst->__rcuref); } /** * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; skb->_skb_refdst = (unsigned long)dst; skb->slow_gro |= !!dst; } return skb->_skb_refdst != 0UL; } /** * __skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups. (no accounting done) */ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { skb->dev = dev; /* * Clear hash so that we can recalculate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. */ skb_clear_hash_if_not_l4(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, !net_eq(net, dev_net(dev))); } /** * skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups, and perform accounting. * Note: this accounting is not SMP safe. */ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { DEV_STATS_INC(dev, rx_packets); DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } static inline u32 dst_tclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID const struct dst_entry *dst; dst = skb_dst(skb); if (dst) return dst->tclassid; #endif return 0; } int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) { } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) { struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr); return IS_ERR(n) ? NULL : n; } static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { struct neighbour *n; if (WARN_ON_ONCE(!dst->ops->neigh_lookup)) return NULL; n = dst->ops->neigh_lookup(dst, skb, NULL); return IS_ERR(n) ? NULL : n; } static inline void dst_confirm_neigh(const struct dst_entry *dst, const void *daddr) { if (dst->ops->confirm_neigh) dst->ops->confirm_neigh(dst, daddr); } static inline void dst_link_failure(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops && dst->ops->link_failure) dst->ops->link_failure(skb); } static inline void dst_set_expires(struct dst_entry *dst, int timeout) { unsigned long expires = jiffies + timeout; if (expires == 0) expires = 1; if (dst->expires == 0 || time_before(expires, dst->expires)) dst->expires = expires; } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, struct sk_buff *skb) { if (likely(dst)) return LL_RESERVED_SPACE(dst->dev); return skb->mac_len; } INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->output, ip6_output, ip_output, net, sk, skb); } INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->input, ip6_input, ip_local_deliver, skb); } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (dst->obsolete) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; } /* Flags for xfrm_lookup flags argument. */ enum { XFRM_LOOKUP_ICMP = 1 << 0, XFRM_LOOKUP_QUEUE = 1 << 1, XFRM_LOOKUP_KEEP_DST_REF = 1 << 2, }; struct flowi; #ifndef CONFIG_XFRM static inline struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct dst_entry * xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { return dst_orig; } static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return NULL; } #else struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id); struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); /* skb attached with this dst needs transformation if dst->xfrm is valid */ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return dst->xfrm; } #endif static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } /* update dst pmtu but not do neighbor confirm */ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); unsigned int dst_blackhole_mtu(const struct dst_entry *dst); #endif /* _NET_DST_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2020 - Google LLC * Author: Quentin Perret <qperret@google.com> */ #ifndef __ARM64_KVM_PKVM_H__ #define __ARM64_KVM_PKVM_H__ #include <linux/arm_ffa.h> #include <linux/memblock.h> #include <linux/scatterlist.h> #include <asm/kvm_pgtable.h> /* Maximum number of VMs that can co-exist under pKVM. */ #define KVM_MAX_PVMS 255 #define HYP_MEMBLOCK_REGIONS 128 int pkvm_init_host_vm(struct kvm *kvm); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu); /* * This functions as an allow-list of protected VM capabilities. * Features not explicitly allowed by this function are denied. */ static inline bool kvm_pvm_ext_allowed(long ext) { switch (ext) { case KVM_CAP_IRQCHIP: case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: case KVM_CAP_MSI_DEVID: case KVM_CAP_ARM_VM_IPA_SIZE: case KVM_CAP_ARM_PMU_V3: case KVM_CAP_ARM_SVE: case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: return true; default: return false; } } extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); static inline unsigned long hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size) { unsigned long nr_pages = reg->size >> PAGE_SHIFT; unsigned long start, end; start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; end = start + nr_pages * vmemmap_entry_size; start = ALIGN_DOWN(start, PAGE_SIZE); end = ALIGN(end, PAGE_SIZE); return end - start; } static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) { unsigned long res = 0, i; for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i], vmemmap_entry_size); } return res >> PAGE_SHIFT; } static inline unsigned long hyp_vm_table_pages(void) { return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT; } static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { unsigned long total = 0; int i; /* Provision the worst case scenario */ for (i = KVM_PGTABLE_FIRST_LEVEL; i <= KVM_PGTABLE_LAST_LEVEL; i++) { nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); total += nr_pages; } return total; } static inline unsigned long __hyp_pgtable_total_pages(void) { unsigned long res = 0, i; /* Cover all of memory with page-granularity */ for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); } return res; } static inline unsigned long hyp_s1_pgtable_pages(void) { unsigned long res; res = __hyp_pgtable_total_pages(); /* Allow 1 GiB for private mappings */ res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); return res; } static inline unsigned long host_s2_pgtable_pages(void) { unsigned long res; /* * Include an extra 16 pages to safely upper-bound the worst case of * concatenated pgds. */ res = __hyp_pgtable_total_pages() + 16; /* Allow 1 GiB for MMIO mappings */ res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); return res; } #ifdef CONFIG_NVHE_EL2_DEBUG static inline unsigned long pkvm_selftest_pages(void) { return 32; } #else static inline unsigned long pkvm_selftest_pages(void) { return 0; } #endif #define KVM_FFA_MBOX_NR_PAGES 1 static inline unsigned long hyp_ffa_proxy_pages(void) { size_t desc_max; /* * The hypervisor FFA proxy needs enough memory to buffer a fragmented * descriptor returned from EL3 in response to a RETRIEVE_REQ call. */ desc_max = sizeof(struct ffa_mem_region) + sizeof(struct ffa_mem_region_attributes) + sizeof(struct ffa_composite_mem_region) + SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range); /* Plus a page each for the hypervisor's RX and TX mailboxes. */ return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); } static inline size_t pkvm_host_sve_state_size(void) { if (!system_supports_sve()) return 0; return size_add(sizeof(struct cpu_sve_state), SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl))); } struct pkvm_mapping { struct rb_node node; u64 gfn; u64 pfn; u64 nr_pages; u64 __subtree_last; /* Internal member for interval tree */ }; int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops); void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold); int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags); void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags); int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc); void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte); #endif /* __ARM64_KVM_PKVM_H__ */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/gfp_types.h> #include <linux/mmzone.h> #include <linux/topology.h> #include <linux/alloc_tag.h> #include <linux/sched.h> struct vm_area_struct; struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) { /* * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. * try_alloc_pages() is the only API that doesn't specify either flag. * * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level * GFP_$FOO flag for this use in try_alloc_pages() as the * regular page allocator doesn't fully support this * allocation mode. */ return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. */ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * gfp flag masking for nested internal allocations. * * For code that needs to do allocations inside the public allocation API (e.g. * memory allocation tracking code) the allocations need to obey the caller * allocation context constrains to prevent allocation context mismatches (e.g. * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock * situations. * * It is also assumed that these nested allocations are for internal kernel * object storage purposes only and are not going to be used for DMA, etc. Hence * we strip out all the zone information and leave just the context information * intact. * * Further, internal allocations must fail before the higher level allocation * can fail, so we must make them fail faster and fail silently. We also don't * want them to deplete emergency reserves. Hence nested allocations must be * prepared for these allocations to fail. */ static inline gfp_t gfp_nested_mask(gfp_t flags) { return ((flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP)) | (__GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)); } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the case of non-NUMA systems the NODE_DATA() gets optimized to * &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); #define alloc_pages_bulk_mempolicy(...) \ alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ #define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_node(...) \ alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) return; if (node_online(this_node)) return; pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); dump_stack(); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); return __alloc_pages_noprof(gfp_mask, order, nid, NULL); } #define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) static inline struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); return __folio_alloc_noprof(gfp, order, nid, NULL); } #define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__)) /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node_noprof(nid, gfp_mask, order); } #define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) { return folio_alloc_noprof(gfp, order); } #define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) struct page *try_alloc_pages_noprof(int nid, unsigned int order); #define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); #define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); #define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define alloc_pages_exact_nid(...) \ alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); static inline bool gfp_has_io_fs(gfp_t gfp) { return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } /* * Check if the gfp flags allow compaction - GFP_NOIO is a really * tricky context because the migration might require IO. */ static inline bool gfp_compaction_allowed(gfp_t gfp_mask) { return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); } extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); #define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CONTIG_ALLOC static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { struct page *page; if (WARN_ON(!order || !(gfp & __GFP_COMP))) return NULL; page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); return page ? page_folio(page) : NULL; } #else static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { return NULL; } #endif /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) #endif /* __LINUX_GFP_H */ |
165 164 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem. */ #define pr_fmt(fmt) "genirq: " fmt #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); #endif static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { unsigned long flags; /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); inprogress = irqd_irq_inprogress(&desc->irq_data); /* * If requested and supported, check at the chip whether it * is in flight at the hardware level, i.e. already pending * in a CPU and waiting for service and acknowledge. */ if (!inprogress && sync_chip) { /* * Ignore the return code. inprogress is only updated * when the chip supports it. */ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ } while (inprogress); } /** * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending hard IRQ handlers for this * interrupt to complete before returning. If you use this * function while holding a resource the IRQ handler may need you * will deadlock. It does not take associated threaded handlers * into account. * * Do not use this for shutdown scenarios where you must be sure * that all parts (hardirq and threaded handler) have completed. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. * * It does not check whether there is an interrupt in flight at the * hardware level, but not serviced yet, as this might deadlock when * called with interrupts disabled and the target CPU of the interrupt * is the current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } return true; } EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. */ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. * * It optionally makes sure (when the irq chip supports that method) * that the interrupt is not pending in any CPU and waiting for * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return false; return true; } /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) { return __irq_can_set_affinity(irq_to_desc(irq)); } /** * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space * @irq: Interrupt to check * * Like irq_can_set_affinity() above, but additionally checks for the * AFFINITY_MANAGED flag. */ bool irq_can_set_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return __irq_can_set_affinity(desc) && !irqd_affinity_is_managed(&desc->irq_data); } /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affinity changed * * We just set IRQTF_AFFINITY and delegate the affinity setting * to the interrupt thread itself. We can not call * set_cpus_allowed_ptr() here as we hold desc->lock and this * code can be called from hard interrupt context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; for_each_action_of_desc(desc, action) { if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); wake_up_process(action->thread); } if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); wake_up_process(action->secondary->thread); } } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); if (!cpumask_empty(m)) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif static DEFINE_PER_CPU(struct cpumask, __tmp_mask); int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; if (!chip || !chip->irq_set_affinity) return -EINVAL; /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with * a housekeeping CPU. If so, then remove the isolated CPUs from * the mask and just keep the housekeeping CPU(s). This prevents * the affinity setter from routing the interrupt to an isolated * CPU to avoid that I/O submitted from a housekeeping CPU causes * interrupts on an isolated one. * * If the masks do not intersect or include online CPU(s) then * keep the requested mask. The isolated target CPUs are only * receiving interrupts when the I/O operation was submitted * directly from them. * * If all housekeeping CPUs in the affinity mask are offline, the * interrupt will be migrated by the CPU hotplug code once a * housekeeping CPU which belongs to the affinity mask comes * online. */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); cpumask_and(tmp_mask, mask, hk_mask); if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = tmp_mask; } else { prog_mask = mask; } /* * Make sure we only provide online CPUs to the irqchip, * unless we are being asked to force the affinity (in which * case we do as we are told). */ cpumask_and(tmp_mask, prog_mask, cpu_online_mask); if (!force && !cpumask_empty(tmp_mask)) ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } return ret; } #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { struct irq_desc *desc = irq_data_to_desc(data); irqd_set_move_pending(data); irq_copy_pending(desc, dest); return 0; } #else static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { return -EBUSY; } #endif static int irq_try_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { int ret = irq_do_set_affinity(data, dest, force); /* * In case that the underlying vector management is busy and the * architecture supports the generic pending mechanism then utilize * this to avoid returning an error to user space. */ if (ret == -EBUSY && !force) ret = irq_set_affinity_pending(data, dest); return ret; } static bool irq_set_affinity_deactivated(struct irq_data *data, const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); /* * Handle irq chips which can handle affinity only in activated * state correctly * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * usable state so startup works. */ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; if (!chip || !chip->irq_set_affinity) return -EINVAL; if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); } if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); if (!schedule_work(&desc->affinity_notify->work)) { /* Work was already scheduled, drop our extra ref */ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); } } irqd_set(data, IRQD_AFFINITY_SET); return ret; } /** * irq_update_affinity_desc - Update affinity management for an interrupt * @irq: The interrupt number to update * @affinity: Pointer to the affinity descriptor * * This interface can be used to configure the affinity management of * interrupts which have been allocated already. * * There are certain limitations on when it may be used - attempts to use it * for when the kernel is configured for generic IRQ reservation mode (in * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with * managed/non-managed interrupt accounting. In addition, attempts to use it on * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { struct irq_desc *desc; unsigned long flags; bool activated; int ret = 0; /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. */ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return -EINVAL; /* Requires the interrupt to be shut down */ if (irqd_is_started(&desc->irq_data)) { ret = -EBUSY; goto out_unlock; } /* Interrupts which are already managed cannot be modified */ if (irqd_affinity_is_managed(&desc->irq_data)) { ret = -EBUSY; goto out_unlock; } /* * Deactivate the interrupt. That's required to undo * anything an earlier activation has established. */ activated = irqd_is_activated(&desc->irq_data); if (activated) irq_domain_deactivate_irq(&desc->irq_data); if (affinity->is_managed) { irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); } cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); /* Restore the activation state */ if (activated) irq_domain_activate_irq(&desc->irq_data, false); out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; if (!desc) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Fails if cpumask does not contain an online CPU */ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, false); } EXPORT_SYMBOL_GPL(irq_set_affinity); /** * irq_force_affinity - Force the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Same as irq_set_affinity, but without checking the mask against * online cpus. * * Solely for low level cpu hotplug code, where we need to make per * cpu interrupts affine before the cpu becomes online. */ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, true); } EXPORT_SYMBOL_GPL(irq_force_affinity); int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); if (m && setaffinity) __irq_set_affinity(irq, m, false); return 0; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; unsigned long flags; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; raw_spin_lock_irqsave(&desc->lock, flags); if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); free_cpumask_var(cpumask); out: kref_put(¬ify->kref, notify->release); } /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification * @notify: Context for notification, or %NULL to disable * notification. Function pointers must be initialised; * the other fields will be initialised by this function. * * Must be called in process context. Notification may only be enabled * after the IRQ is allocated and must be disabled before the IRQ is * freed using free_irq(). */ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; unsigned long flags; /* The release function is promised process context */ might_sleep(); if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ if (notify) { notify->irq = irq; kref_init(¬ify->kref); INIT_WORK(¬ify->work, irq_affinity_notify); } raw_spin_lock_irqsave(&desc->lock, flags); old_notify = desc->affinity_notify; desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { if (cancel_work_sync(&old_notify->work)) { /* Pending work had a ref, put that one too */ kref_put(&old_notify->kref, old_notify->release); } kref_put(&old_notify->kref, old_notify->release); } return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int ret, node = irq_desc_get_node(desc); static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; raw_spin_lock(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } cpumask_and(&mask, cpu_online_mask, set); if (cpumask_empty(&mask)) cpumask_copy(&mask, cpu_online_mask); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } ret = irq_do_set_affinity(&desc->irq_data, &mask, false); raw_spin_unlock(&mask_lock); return ret; } #else /* Wrapper for ALPHA specific affinity selector magic */ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* CONFIG_AUTO_IRQ_AFFINITY */ #endif /* CONFIG_SMP */ /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU * affinity for an irq. The vCPU specific data is passed from * outside, such as KVM. One example code path is as below: * KVM -> IOMMU -> irq_set_vcpu_affinity(). */ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); struct irq_data *data; struct irq_chip *chip; int ret = -ENOSYS; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (chip && chip->irq_set_vcpu_affinity) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) ret = chip->irq_set_vcpu_affinity(data, vcpu_info); irq_put_desc_unlock(desc, flags); return ret; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); } static int __disable_irq_nosync(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; __disable_irq(desc); irq_put_desc_busunlock(desc, flags); return 0; } /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and Enables are * nested. * Unlike disable_irq(), this function does not ensure existing * instances of the IRQ handler have completed before returning. * * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are * nested. * This function waits for any pending IRQ handlers for this interrupt * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) { might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); /** * disable_hardirq - disables an irq and waits for hardirq completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are * nested. * This function waits for any pending hard IRQ handlers for this * interrupt to complete before returning. If you use this function while * holding a resource the hard IRQ handler may need you will deadlock. * * When used to optimistically disable an interrupt from atomic context * the return value must be checked. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** * disable_nmi_nosync - disable an nmi without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and enables are * nested. * The interrupt to disable must have been requested through request_nmi. * Unlike disable_nmi(), this function does not ensure existing * instances of the IRQ handler have completed before returning. */ void disable_nmi_nosync(unsigned int irq) { disable_irq_nosync(irq); } void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the * interrupt might be marked NOAUTOEN so irq_startup() * needs to be invoked when it gets enabled the first time. * This is also required when __enable_irq() is invoked for * a managed and shutdown interrupt from the S3 resume * path. * * If it was already started up, then irq_startup() will * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: desc->depth--; } } /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable * * Undoes the effect of one call to disable_irq(). If this * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. * * This function may be called from IRQ context only when * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; if (WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; __enable_irq(desc); out: irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); /** * enable_nmi - enable handling of an nmi * @irq: Interrupt to enable * * The interrupt to enable must have been requested through request_nmi. * Undoes the effect of one call to disable_nmi(). If this * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { enable_irq(irq); } static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } /** * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * * Enable/disable power management wakeup mode, which is * disabled by default. Enables and disables must match, * just as they match for non-wakeup mode support. * * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". * * Note: irq enable/disable state is completely orthogonal * to the enable/disable state of irq wake. An irq can be * disabled with disable_irq() and still wake the system as * long as the irq has wake enabled. If this does not hold, * then the underlying irq chip and the related driver need * to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) return -EINVAL; /* Don't use NMIs as wake up interrupts please */ if (irq_is_nmi(desc)) { ret = -EINVAL; goto out_unlock; } /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 1; else irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. */ int can_request_irq(unsigned int irq, unsigned long irqflags) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); int canrequest = 0; if (!desc) return 0; if (irq_settings_can_request(desc)) { if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) canrequest = 1; } irq_put_desc_unlock(desc, flags); return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ pr_debug("No set_type function for IRQ %d (%s)\n", irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } /* Mask all flags except trigger mode */ flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); irq_settings_set_trigger_mask(desc, flags); irqd_clear(&desc->irq_data, IRQD_LEVEL); irq_settings_clr_level(desc); if (flags & IRQ_TYPE_LEVEL_MASK) { irq_settings_set_level(desc); irqd_set(&desc->irq_data, IRQD_LEVEL); } ret = 0; break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); return ret; } #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->parent_irq = parent_irq; irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called * with handler == NULL. Useful for oneshot interrupts. */ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) { return IRQ_WAKE_THREAD; } /* * Primary handler for nested threaded interrupts. Should never be * called. */ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) { WARN(1, "Primary handler called for nested irq %d\n", irq); return IRQ_NONE; } static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) { WARN(1, "Secondary action handler called for irq %d\n", irq); return IRQ_NONE; } #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; __set_current_state(TASK_RUNNING); /* * In case we are out of memory we set IRQTF_AFFINITY again and * try again next time */ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { set_bit(IRQTF_AFFINITY, &action->thread_flags); return; } raw_spin_lock_irq(&desc->lock); /* * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ if (cpumask_available(desc->irq_common_data.affinity)) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); valid = true; } raw_spin_unlock_irq(&desc->lock); if (valid) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif static int irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } __set_current_state(TASK_RUNNING); return -1; } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } schedule(); } } /* * Oneshot interrupts keep the irq line masked until the threaded * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT) || action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* * Implausible though it may be we need to protect us against * the following scenario: * * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. */ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } /* * Now check again, whether the thread should run. Otherwise * we would clear the threads_oneshot bit of this thread which * was just set. */ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } /* * Interrupts explicitly requested as threaded interrupts want to be * preemptible - many of them need to sleep and wait for slow busses to * complete. */ static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); return ret; } /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; local_bh_disable(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); ret = irq_thread_fn(desc, action); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); local_bh_enable(); return ret; } void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; struct irqaction *action; if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) return; action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); /* * If IRQTF_RUNTHREAD is set, we need to decrement * desc->threads_active and wake possible waiters. */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action); } static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) { struct irqaction *secondary = action->secondary; if (WARN_ON_ONCE(!secondary)) return; raw_spin_lock_irq(&desc->lock); __irq_wake_thread(desc, secondary); raw_spin_unlock_irq(&desc->lock); } /* * Internal function to notify that a interrupt thread is ready. */ static void irq_thread_set_ready(struct irq_desc *desc, struct irqaction *action) { set_bit(IRQTF_READY, &action->thread_flags); wake_up(&desc->wait_for_threads); } /* * Internal function to wake up a interrupt thread and wait until it is * ready. */ static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, struct irqaction *action) { if (!action || !action->thread) return; wake_up_process(action->thread); wait_event(desc->wait_for_threads, test_bit(IRQTF_READY, &action->thread_flags)); } /* * Interrupt handler thread */ static int irq_thread(void *data) { struct callback_head on_exit_work; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); irq_thread_set_ready(desc, action); sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, TWA_NONE); while (!irq_wait_for_interrupt(desc, action)) { irqreturn_t action_ret; action_ret = handler_fn(desc, action); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); wake_threads_waitq(desc); } /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ task_work_cancel_func(current, irq_thread_dtor); return 0; } /** * irq_wake_thread - wake the irq thread for the action identified by dev_id * @irq: Interrupt line * @dev_id: Device identity for which the thread should be woken * */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; /* * No further action required for interrupts which are requested as * threaded interrupts already */ if (new->handler == irq_default_primary_handler) return 0; new->flags |= IRQF_ONESHOT; /* * Handle the case where we have a real primary handler and a * thread handler. We force thread them as well by creating a * secondary action. */ if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) return -ENOMEM; new->secondary->handler = irq_forced_secondary_handler; new->secondary->thread_fn = new->thread_fn; new->secondary->dev_id = new->dev_id; new->secondary->irq = new->irq; new->secondary->name = new->name; } /* Deal with the primary handler */ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); new->thread_fn = new->handler; new->handler = irq_default_primary_handler; return 0; } static int irq_request_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; return c->irq_request_resources ? c->irq_request_resources(d) : 0; } static void irq_release_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; if (c->irq_release_resources) c->irq_release_resources(d); } static bool irq_supports_nmi(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* Only IRQs directly managed by the root irqchip can be set as NMI */ if (d->parent_data) return false; #endif /* Don't support NMIs for chips behind a slow bus */ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) return false; return d->chip->flags & IRQCHIP_SUPPORTS_NMI; } static int irq_nmi_setup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; } static void irq_nmi_teardown(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; if (c->irq_nmi_teardown) c->irq_nmi_teardown(d); } static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); } if (IS_ERR(t)) return PTR_ERR(t); /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. This is * important for shared interrupt handlers as we do * not invoke setup_affinity() for the secondary * handlers as everything is already set up. Even for * interrupts marked with IRQF_NO_BALANCE this is * correct as we want the thread to move to the cpu(s) * on which the requesting code placed the interrupt. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. * * Locking rules: * * desc->request_mutex Provides serialization against a concurrent free_irq() * chip_bus_lock Provides serialization for slow bus operations * desc->lock Provides serialization against hard interrupts * * chip_bus_lock and desc->lock are sufficient for all other management and * interrupt related functions. desc->request_mutex solely serializes * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; new->irq = irq; /* * If the trigger type is not specified by the caller, * then use the default for this interrupt. */ if (!(new->flags & IRQF_TRIGGER_MASK)) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* * Check whether the interrupt nests into another interrupt * thread. */ nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) { ret = -EINVAL; goto out_mput; } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; } else { if (irq_settings_can_thread(desc)) { ret = irq_setup_forced_threading(new); if (ret) goto out_mput; } } /* * Create a handler thread when a thread function is supplied * and the interrupt does not nest into another interrupt * thread. */ if (new->thread_fn && !nested) { ret = setup_irq_thread(new, irq, false); if (ret) goto out_mput; if (new->secondary) { ret = setup_irq_thread(new->secondary, irq, true); if (ret) goto out_thread; } } /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a * threaded irq without a primary hard irq context handler * requires the ONESHOT flag to be set. Some irq chips like * MSI based interrupts are per se one shot safe. Check the * chip flags, so we can avoid the unmask dance at the end of * the threaded handler for those. */ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. */ mutex_lock(&desc->request_mutex); /* * Acquire bus lock as the irq_request_resources() callback below * might rely on the serialization or the magic power management * functions which are abusing the irq_bus_lock() callback, */ chip_bus_lock(desc); /* First installed action requests resources. */ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); goto out_bus_unlock; } } /* * The following block of code has to be executed atomically * protected against a concurrent interrupt and any of the other * management calls which are not serialized via * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } /* * If nobody did set the configuration before, inherit * the one provided by the requester. */ if (irqd_trigger_type_was_set(&desc->irq_data)) { oldtype = irqd_get_trigger_type(&desc->irq_data); } else { oldtype = new->flags & IRQF_TRIGGER_MASK; irqd_set_trigger_type(&desc->irq_data, oldtype); } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK))) goto mismatch; if ((old->flags & IRQF_ONESHOT) && (new->flags & IRQF_COND_ONESHOT)) new->flags |= IRQF_ONESHOT; else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; /* add new interrupt at end of irq queue */ do { /* * Or all existing action->thread_mask bits, * so we can find the next zero bit for this * new action. */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } /* * Setup the thread mask for this irqaction for ONESHOT. For * !ONESHOT irqs the thread mask is 0 so we can avoid a * conditional in irq_wake_thread(). */ if (new->flags & IRQF_ONESHOT) { /* * Unlikely to have 32 resp 64 irqs sharing one line, * but who knows. */ if (thread_mask == ~0UL) { ret = -EBUSY; goto out_unlock; } /* * The thread_mask for the action is or'ed to * desc->thread_active to indicate that the * IRQF_ONESHOT thread handler has been woken, but not * yet finished. The bit is cleared when a thread * completes. When all threads of a shared interrupt * line have completed desc->threads_active becomes * zero and the interrupt line is unmasked. See * handle.c:irq_wake_thread() for further information. * * If no thread is woken by primary (hard irq context) * interrupt handlers, then desc->threads_active is * also checked for zero to unmask the irq line in the * affected hard irq flow handlers * (handle_[fasteoi|level]_irq). * * The new action gets the first zero bit of * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it * does not have the oneshot flag set. In combination * with level interrupts this is deadly, because the * default primary handler just wakes the thread, then * the irq lines is reenabled, but the device still * has the level irq asserted. Rinse and repeat.... * * While this works for edge type interrupts, we play * it safe and reject unconditionally because we can't * say for sure which type this interrupt really * has. The type flags are unreliable as the * underlying chip implementation can override them. */ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", new->name, irq); ret = -EINVAL; goto out_unlock; } if (!shared) { /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_unlock; } /* * Activate the interrupt. That activation must happen * independently of IRQ_NOAUTOEN. request_irq() can fail * and the callers are supposed to handle * that. enable_irq() of an interrupt requested with * IRQ_NOAUTOEN is not supposed to fail. The activation * keeps it in shutdown mode, it merily associates * resources if necessary and if that's not possible it * fails. Interrupts which are in managed shutdown mode * will simply ignore that activation request. */ ret = irq_activate(desc); if (ret) goto out_unlock; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); irq_settings_set_per_cpu(desc); if (new->flags & IRQF_NO_DEBUG) irq_settings_set_no_debug(desc); } if (noirqdebug) irq_settings_set_no_debug(desc); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { irq_settings_set_no_balancing(desc); irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request * it while it's still disabled and then wait for * interrupts forever. */ WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; } } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", irq, omsk, nmsk); } *old_ptr = new; irq_pm_install_action(desc, new); /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; /* * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); #endif } ret = -EBUSY; out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); if (!desc->action) irq_release_resources(desc); out_bus_unlock: chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: if (new->thread) { struct task_struct *t = new->thread; new->thread = NULL; kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; kthread_stop_put(t); } out_mput: module_put(desc->owner); return ret; } /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ action_ptr = &desc->action; for (;;) { action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); return NULL; } if (action->dev_id == dev_id) break; action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; irq_pm_remove_action(desc, action); /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ if (WARN_ON_ONCE(desc->affinity_hint)) desc->affinity_hint = NULL; #endif raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a * concurrent request_irq() of this irq so the release of resources * and timing data is properly serialized. */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); /* * Make sure it's not being used on another CPU and if the chip * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ * event to happen even now it's being freed, so let's make sure that * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); action->handler(irq, dev_id); local_irq_restore(flags); } #endif /* * The action has already been removed above, but the thread writes * its oneshot mask bit when it completes. Though request_mutex is * held across this which prevents __setup_irq() from handing out * the same bit to a newly requested action. */ if (action->thread) { kthread_stop_put(action->thread); if (action->secondary && action->secondary->thread) kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ if (!desc->action) { /* * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); /* * There is no interrupt on the fly anymore. Deactivate it * completely. */ raw_spin_lock_irqsave(&desc->lock, flags); irq_domain_deactivate_irq(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; } /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove an interrupt handler. The handler is removed and if the * interrupt line is no longer in use by any driver it is disabled. * On a shared IRQ the caller must ensure the interrupt is disabled * on the card it drives before calling this function. The function * does not return until any executing interrupts for this IRQ * have completed. * * This function must not be called from interrupt context. * * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif action = __free_irq(desc, dev_id); if (!action) return NULL; devname = action->name; kfree(action); return devname; } EXPORT_SYMBOL(free_irq); /* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { const char *devname = NULL; desc->istate &= ~IRQS_NMI; if (!WARN_ON(desc->action == NULL)) { irq_pm_remove_action(desc, desc->action); devname = desc->action->name; unregister_handler_proc(irq, desc->action); kfree(desc->action); desc->action = NULL; } irq_settings_clr_disable_unlazy(desc); irq_shutdown_and_deactivate(desc); irq_release_resources(desc); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return devname; } const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; const void *devname; if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; /* NMI still enabled */ if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); raw_spin_lock_irqsave(&desc->lock, flags); irq_nmi_teardown(desc); devname = __cleanup_nmi(irq, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return devname; } /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts. * If handler is NULL and thread_fn != NULL * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt line and IRQ handling. From the point this * call is made your handler function may be invoked. Since * your handler function must clear any interrupt the board * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device * then you need to supply @handler and @thread_fn. @handler is * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return * IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support * shared interrupts. * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. * * If your interrupt is shared you must pass a non NULL dev_id * as this is required when freeing the interrupt. * * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). * * Also shared interrupts do not go well with disabling auto enable. * The sharing interrupt might request it while it's still disabled * and then wait for interrupts forever. * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. */ if (((irqflags & IRQF_SHARED) && !dev_id) || ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. */ unsigned long flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; } EXPORT_SYMBOL(request_threaded_irq); /** * request_any_context_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @flags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt line and IRQ handling. It selects either a * hardirq or threaded handling method depending on the * context. * * On failure, it returns a negative value. On success, * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { struct irq_desc *desc; int ret; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; } ret = request_irq(irq, handler, flags, name, dev_id); return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); /** * request_nmi - allocate an interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @irqflags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt line and IRQ handling. It sets up the IRQ line * to be handled as an NMI. * * An interrupt line delivering NMIs cannot be shared and IRQ handling * cannot be threaded. * * Interrupt lines requested for NMI delivering must produce per cpu * interrupts and have auto enabling setting disabled. * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. * * If the interrupt line cannot be used to deliver NMIs, function * will fail and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; unsigned long flags; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* NMI cannot be shared, used for Polling */ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) return -EINVAL; if (!(irqflags & IRQF_PERCPU)) return -EINVAL; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; raw_spin_lock_irqsave(&desc->lock, flags); /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); if (retval) { __cleanup_nmi(irq, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return -EINVAL; } raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } void enable_percpu_irq(unsigned int irq, unsigned int type) { unsigned int cpu = smp_processor_id(); unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return; /* * If the trigger type is not specified by the caller, then * use the default for this interrupt. */ type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) type = irqd_get_trigger_type(&desc->irq_data); if (type != IRQ_TYPE_NONE) { int ret; ret = __irq_set_trigger(desc, type); if (ret) { WARN(1, "failed to set type for IRQ%d\n", irq); goto out; } } irq_percpu_enable(desc, cpu); out: irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(enable_percpu_irq); void enable_percpu_nmi(unsigned int irq, unsigned int type) { enable_percpu_irq(irq, type); } /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for * * Must be called from a non migratable context. Returns the enable * state of a per cpu interrupt on the current cpu. */ bool irq_percpu_is_enabled(unsigned int irq) { unsigned int cpu = smp_processor_id(); struct irq_desc *desc; unsigned long flags; bool is_enabled; desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return false; is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); irq_put_desc_unlock(desc, flags); return is_enabled; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { unsigned int cpu = smp_processor_id(); unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return; irq_percpu_disable(desc, cpu); irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(disable_percpu_irq); void disable_percpu_nmi(unsigned int irq) { disable_percpu_irq(irq); } /* * Internal function to unregister a percpu irqaction. */ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (!action || action->percpu_dev_id != dev_id) { WARN(1, "Trying to free already-free IRQ %d\n", irq); goto bad; } if (!cpumask_empty(desc->percpu_enabled)) { WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, cpumask_first(desc->percpu_enabled)); goto bad; } /* Found it - now remove it from the list of entries: */ desc->action = NULL; desc->istate &= ~IRQS_NMI; raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; bad: raw_spin_unlock_irqrestore(&desc->lock, flags); return NULL; } /** * remove_percpu_irq - free a per-cpu interrupt * @irq: Interrupt line to free * @act: irqaction for the interrupt * * Used to remove interrupts statically setup by the early boot process. */ void remove_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); if (desc && irq_settings_is_per_cpu_devid(desc)) __free_percpu_irq(irq, act->percpu_dev_id); } /** * free_percpu_irq - free an interrupt allocated with request_percpu_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove a percpu interrupt handler. The handler is removed, but * the interrupt line is not disabled. This must be done on each * CPU before calling this function. The function does not return * until any executing interrupts for this IRQ have completed. * * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; chip_bus_lock(desc); kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(free_percpu_irq); void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); } /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ int setup_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); int retval; if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) return retval; retval = __setup_irq(irq, desc, act); if (retval) irq_chip_pm_put(&desc->irq_data); return retval; } /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt on the local CPU. If the interrupt is supposed to be * enabled on other CPUs, it has to be done on each CPU using * enable_percpu_irq(). * * Dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; if (flags && flags != IRQF_TIMER) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action); } return retval; } EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs * have to be setup on each CPU by calling prepare_percpu_nmi() before * being enabled on the same CPU by using enable_percpu_nmi(). * * Dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. * * Interrupt lines requested for NMI delivering should have auto enabling * setting disabled. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; unsigned long flags; int retval; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc) || irq_settings_can_autoenable(desc) || !irq_supports_nmi(desc)) return -EINVAL; /* The line cannot already be NMI */ if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; raw_spin_lock_irqsave(&desc->lock, flags); desc->istate |= IRQS_NMI; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } /** * prepare_percpu_nmi - performs CPU local setup for NMI delivery * @irq: Interrupt line to prepare for NMI delivery * * This call prepares an interrupt line to deliver NMI on the current CPU, * before that interrupt line gets enabled with enable_percpu_nmi(). * * As a CPU local operation, this should be called from non-preemptible * context. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ int prepare_percpu_nmi(unsigned int irq) { unsigned long flags; struct irq_desc *desc; int ret = 0; WARN_ON(preemptible()); desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return -EINVAL; if (WARN(!irq_is_nmi(desc), KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) { ret = -EINVAL; goto out; } ret = irq_nmi_setup(desc); if (ret) { pr_err("Failed to setup NMI delivery: irq %u\n", irq); goto out; } out: irq_put_desc_unlock(desc, flags); return ret; } /** * teardown_percpu_nmi - undoes NMI setup of IRQ line * @irq: Interrupt line from which CPU local NMI configuration should be * removed * * This call undoes the setup done by prepare_percpu_nmi(). * * IRQ line should not be enabled for the current CPU. * * As a CPU local operation, this should be called from non-preemptible * context. */ void teardown_percpu_nmi(unsigned int irq) { unsigned long flags; struct irq_desc *desc; WARN_ON(preemptible()); desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return; if (WARN_ON(!irq_is_nmi(desc))) goto out; irq_nmi_teardown(desc); out: irq_put_desc_unlock(desc, flags); } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_get_irqchip_state(data, which, state); return err; } /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an * interrupt, returning into @state the bit corresponding to * stage @which * * This function should be called with preemption disabled if the * interrupt controller has per-cpu registers. */ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { struct irq_desc *desc; struct irq_data *data; unsigned long flags; int err = -EINVAL; desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return err; data = irq_desc_get_irq_data(desc); err = __irq_get_irqchip_state(data, which, state); irq_put_desc_busunlock(desc, flags); return err; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; unsigned long flags; int err = -EINVAL; desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return err; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) { err = -ENODEV; goto out_unlock; } if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_set_irqchip_state(data, which, val); out_unlock: irq_put_desc_busunlock(desc, flags); return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); /** * irq_has_action - Check whether an interrupt is requested * @irq: The linux irq number * * Returns: A snapshot of the current state */ bool irq_has_action(unsigned int irq) { bool res; rcu_read_lock(); res = irq_desc_has_action(irq_to_desc(irq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_has_action); /** * irq_check_status_bit - Check whether bits in the irq descriptor status are set * @irq: The linux irq number * @bitmask: The bitmask to evaluate * * Returns: True if one of the bits in @bitmask is set */ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) { struct irq_desc *desc; bool res = false; rcu_read_lock(); desc = irq_to_desc(irq); if (desc) res = !!(desc->status_use_accessors & bitmask); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_check_status_bit); |
5 5 3 5 5 1 1 2 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | // SPDX-License-Identifier: GPL-2.0-only /* * ratelimit.c - Do something with rate limit. * * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com> * * 2008-05-01 rewrite the function and use a ratelimit_state data struct as * parameter. Now every user can use their own standalone ratelimit_state. */ #include <linux/ratelimit.h> #include <linux/jiffies.h> #include <linux/export.h> /* * __ratelimit - rate limiting * @rs: ratelimit_state data * @func: name of calling function * * This enforces a rate limit: not more than @rs->burst callbacks * in every @rs->interval * * RETURNS: * 0 means callbacks will be suppressed. * 1 means go ahead and do it. */ int ___ratelimit(struct ratelimit_state *rs, const char *func) { /* Paired with WRITE_ONCE() in .proc_handler(). * Changing two values seperately could be inconsistent * and some message could be lost. (See: net_ratelimit_state). */ int interval = READ_ONCE(rs->interval); int burst = READ_ONCE(rs->burst); unsigned long flags; int ret; if (!interval) return 1; /* * If we contend on this state's lock then almost * by definition we are too busy to print a message, * in addition to the one that will be printed by * the entity that is holding the lock already: */ if (!raw_spin_trylock_irqsave(&rs->lock, flags)) return 0; if (!rs->begin) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING "%s: %d callbacks suppressed\n", func, rs->missed); rs->missed = 0; } } rs->begin = jiffies; rs->printed = 0; } if (burst && burst > rs->printed) { rs->printed++; ret = 1; } else { rs->missed++; ret = 0; } raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; } EXPORT_SYMBOL(___ratelimit); |
3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * FIB front-end. * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ /* Changes: * * YOSHIFUJI Hideaki @USAGI * reworked default router selection. * - respect outgoing interface * - select from (probably) reachable routers (i.e. * routers in REACHABLE, STALE, DELAY or PROBE states). * - always select the same router if it is (probably) * reachable. otherwise, round-robin the list. * Ville Nuorvala * Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/capability.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/init.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/jhash.h> #include <linux/siphash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/tcp.h> #include <linux/rtnetlink.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/ip.h> #include <linux/uaccess.h> #include <linux/btf_ids.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static int ip6_rt_type_to_error(u8 fib6_type); #define CREATE_TRACE_POINTS #include <trace/events/fib6.h> EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); #undef CREATE_TRACE_POINTS enum rt6_nud_state { RT6_NUD_FAIL_HARD = -3, RT6_NUD_FAIL_PROBE = -2, RT6_NUD_FAIL_DO_RR = -1, RT6_NUD_SUCCEED = 1 }; INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); #endif struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void rt6_uncached_list_flush_dev(struct net_device *dev) { int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt, *safe; if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; bool handled = false; if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; } if (rt_dev == dev) { rt->dst.dev = blackhole_netdev; netdev_ref_replace(rt_dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); handled = true; } if (handled) list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) return &ipv6_hdr(skb)->daddr; return daddr; } struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr) { struct neighbour *n; daddr = choose_neigh_daddr(gw, skb, daddr); n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; n = neigh_create(&nd_tbl, daddr, dev); return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst->dev, skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst->dev; daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) return; if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) return; __ipv6_confirm_neigh(dev, daddr); } static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .default_advmss = ip6_default_advmss, .neigh_lookup = ip6_dst_neigh_lookup, .check = ip6_dst_check, .destroy = ip6_dst_destroy, .cow_metrics = dst_cow_metrics_generic, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; static const struct rt6_info ip6_null_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, .input = ip6_pkt_discard, .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES static const struct rt6_info ip6_prohibit_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, .input = ip6_pkt_prohibit, .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #endif static void rt6_info_init(struct rt6_info *rt) { memset_after(rt, 0, dst); } /* allocate dst with ip6_dst_ops */ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, flags); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); } return rt; } EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; ip_dst_metrics_put(dst); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); if (blackhole_idev) { rt->rt6i_idev = blackhole_idev; in6_dev_put(idev); } } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, rt->dst.expires); else return false; } static bool rt6_check_expired(const struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; } else if (from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; } static struct fib6_info * rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; iter = rcu_dereference(fn->leaf); if (!iter) goto out; while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference(iter->fib6_next); } out: return NULL; } void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; if (match->nh && have_oif_match && res->nh) return; if (skb) IP6CB(skb)->flags |= IP6SKB_MULTIPATH; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash && (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (unlikely(match->nh)) { nexthop_path_fib6_result(res, fl6->mp_hash); return; } first = rt6_multipath_first_sibling_rcu(match); if (!first) goto out; hash = fl6->mp_hash; if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, strict) >= 0) match = first; goto out; } list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } out: res->f6i = match; res->nh = match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. */ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, const struct in6_addr *saddr, int oif, int flags) { const struct net_device *dev; if (nh->fib_nh_flags & RTNH_F_DEAD) return false; dev = nh->fib_nh_dev; if (oif) { if (dev->ifindex == oif) return true; } else { if (ipv6_chk_addr(net, saddr, dev, flags & RT6_LOOKUP_F_IFACE)) return true; } return false; } struct fib6_nh_dm_arg { struct net *net; const struct in6_addr *saddr; int oif; int flags; struct fib6_nh *nh; }; static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_dm_arg *arg = _arg; arg->nh = nh; return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, arg->flags); } /* returns fib6_nh from nexthop or NULL */ static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_nh_dm_arg arg = { .net = net, .saddr = saddr, .oif = oif, .flags = flags, }; if (nexthop_is_blackhole(nh)) return NULL; if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) return arg.nh; return NULL; } static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_info *f6i = res->f6i; struct fib6_info *spf6i; struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { bool matched = false; if (unlikely(spf6i->nh)) { nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, oif, flags); if (nh) matched = true; } else { nh = spf6i->fib6_nh; if (__rt6_device_match(net, nh, saddr, oif, flags)) matched = true; } if (matched) { res->f6i = spf6i; goto out; } } if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; goto out; } if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; return; out_blackhole: res->fib6_flags |= RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF struct __rt6_probe_work { struct work_struct work; struct in6_addr target; struct net_device *dev; netdevice_tracker dev_tracker; }; static void rt6_probe_deferred(struct work_struct *w) { struct in6_addr mcaddr; struct __rt6_probe_work *work = container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); netdev_put(work->dev, &work->dev_tracker); kfree(work); } static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; unsigned long last_probe; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it * is really so; aka Router Reachability Probing. * * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ if (!fib6_nh->fib_nh_gw_family) return; nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); if (!idev) goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) goto out; write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (!work || cmpxchg(&fib6_nh->last_probe, last_probe, jiffies) != last_probe) { kfree(work); } else { INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); work->dev = dev; schedule_work(&work->work); } out: rcu_read_unlock(); } #else static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif /* * Default Router Selection (RFC 2461 6.3.6) */ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; rcu_read_lock(); neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, &fib6_nh->fib_nh_gw6); if (neigh) { u8 nud_state = READ_ONCE(neigh->nud_state); if (nud_state & NUD_VALID) ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; else ret = RT6_NUD_FAIL_PROBE; #endif } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } rcu_read_unlock(); return ret; } static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict) { int m = 0; if (!oif || nh->fib_nh_dev->ifindex == oif) m = 2; if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif if ((strict & RT6_LOOKUP_F_REACHABLE) && !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } static bool find_match(struct fib6_nh *nh, u32 fib6_flags, int oif, int strict, int *mpri, bool *do_rr) { bool match_do_rr = false; bool rc = false; int m; if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; if (ip6_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ } else if (m == RT6_NUD_FAIL_HARD) { goto out; } if (strict & RT6_LOOKUP_F_REACHABLE) rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; rc = true; } out: return rc; } struct fib6_nh_frl_arg { u32 flags; int oif; int strict; int *mpri; bool *do_rr; struct fib6_nh *nh; }; static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_frl_arg *arg = _arg; arg->nh = nh; return find_match(nh, arg->flags, arg->oif, arg->strict, arg->mpri, arg->do_rr); } static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, int oif, int strict, bool *do_rr, int *mpri) { struct fib6_info *f6i; for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { *cont = f6i; return; } if (fib6_check_expired(f6i)) continue; if (unlikely(f6i->nh)) { struct fib6_nh_frl_arg arg = { .flags = f6i->fib6_flags, .oif = oif, .strict = strict, .mpri = mpri, .do_rr = do_rr }; if (nexthop_is_blackhole(f6i->nh)) { res->fib6_flags = RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->f6i = f6i; res->nh = nexthop_fib6_nh(f6i->nh); return; } if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, &arg)) { matched = true; nh = arg.nh; } } else { nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) matched = true; } if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; res->fib6_type = f6i->fib6_type; } } } static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, struct fib6_info *rr_head, int oif, int strict, bool *do_rr, struct fib6_result *res) { u32 metric = rr_head->fib6_metric; struct fib6_info *cont = NULL; int mpri = -1; __find_rr_leaf(rr_head, NULL, metric, res, &cont, oif, strict, do_rr, &mpri); __find_rr_leaf(leaf, rr_head, metric, res, &cont, oif, strict, do_rr, &mpri); if (res->f6i || !cont) return; __find_rr_leaf(cont, NULL, metric, res, NULL, oif, strict, do_rr, &mpri); } static void rt6_select(struct net *net, struct fib6_node *fn, int oif, struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct fib6_info *rt0; bool do_rr = false; int key_plen; /* make sure this function or its helpers sets f6i */ res->f6i = NULL; if (!leaf || leaf == net->ipv6.fib6_null_entry) goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) */ key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES if (rt0->fib6_src.plen) key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) goto out; find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } } static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { return (res->f6i->fib6_flags & RTF_NONEXTHOP) || res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) { struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; } /* Sanity check for prefix_len and length */ if (rinfo->length > 3) { return -EINVAL; } else if (rinfo->prefix_len > 128) { return -EINVAL; } else if (rinfo->prefix_len > 64) { if (rinfo->length < 2) { return -EINVAL; } } else if (rinfo->prefix_len > 0) { if (rinfo->length < 1) { return -EINVAL; } } pref = rinfo->route_pref; if (pref == ICMPV6_ROUTER_PREF_INVALID) return -EINVAL; lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; else { /* this function is safe */ ipv6_addr_prefix(&prefix_buf, (struct in6_addr *)rinfo->prefix, rinfo->prefix_len); prefix = &prefix_buf; } if (rinfo->prefix_len == 0) rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { ip6_del_rt(net, rt, false); rt = NULL; } if (!rt && lifetime) rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) rt->fib6_flags = RTF_ROUTEINFO | (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); fib6_remove_gc_list(rt); } else { fib6_set_expires(rt, jiffies + HZ * lifetime); fib6_add_gc_list(rt); } spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } return 0; } #endif /* * Misc support functions */ /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which * case we want dev returned to be dev */ } return dev; } static const int fib6_prop[RTN_MAX + 1] = { [RTN_UNSPEC] = 0, [RTN_UNICAST] = 0, [RTN_LOCAL] = 0, [RTN_BROADCAST] = 0, [RTN_ANYCAST] = 0, [RTN_MULTICAST] = 0, [RTN_BLACKHOLE] = -EINVAL, [RTN_UNREACHABLE] = -EHOSTUNREACH, [RTN_PROHIBIT] = -EACCES, [RTN_THROW] = -EAGAIN, [RTN_NAT] = -EINVAL, [RTN_XRESOLVE] = -EINVAL, }; static int ip6_rt_type_to_error(u8 fib6_type) { return fib6_prop[fib6_type]; } static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; if (rt->dst_nocount) flags |= DST_NOCOUNT; if (rt->dst_nopolicy) flags |= DST_NOPOLICY; return flags; } static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { rt->dst.error = ip6_rt_type_to_error(fib6_type); switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: rt->dst.output = ip6_pkt_prohibit_out; rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: case RTN_UNREACHABLE: default: rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; } } static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; if (res->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; } else { rt->dst.input = ip6_forward; } if (res->nh->fib_nh_lws) { rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } rt->dst.lastuse = jiffies; } /* Caller must already hold reference to @from */ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } /* Caller must already hold reference to f6i in result */ static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; const struct net_device *dev = nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; ip6_rt_init_dst(rt, res); rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_flags = res->fib6_flags; if (nh->fib_nh_gw_family) { rt->rt6i_gateway = nh->fib_nh_gw6; rt->rt6i_flags |= RTF_GATEWAY; } rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = f6i->fib6_src; #endif } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; pn = rcu_dereference(fn->parent); sn = FIB6_SUBTREE(pn); if (sn && sn != fn) fn = fib6_node_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) return fn; } } static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) { struct rt6_info *rt = *prt; if (dst_hold_safe(&rt->dst)) return true; if (net) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } else { rt = NULL; } *prt = rt; return false; } /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; unsigned short flags; struct rt6_info *nrt; if (!fib6_info_hold_safe(f6i)) goto fallback; flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { fib6_info_release(f6i); goto fallback; } ip6_rt_copy_init(nrt, res); return nrt; fallback: nrt = dev_net(dev)->ipv6.ip6_null_entry; dst_hold(&nrt->dst); return nrt; } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: res.f6i = rcu_dereference(fn->leaf); if (!res.f6i) res.f6i = net->ipv6.fib6_null_entry; else rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, flags); if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; } else if (res.fib6_flags & RTF_REJECT) { goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { do_create: rt = ip6_create_rt_rcu(&res); } out: trace_fib6_table_lookup(net, &res, table, fl6); rcu_read_unlock(); return rt; } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, .daddr = *daddr, }; struct dst_entry *dst; int flags = strict ? RT6_LOOKUP_F_IFACE : 0; if (saddr) { memcpy(&fl6.saddr, saddr, sizeof(*saddr)); flags |= RT6_LOOKUP_F_HAS_SADDR; } dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return dst_rt6_info(dst); dst_release(dst); return NULL; } EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. * It takes new route entry, the addition fails by any reason the * route is released. * Caller must hold dst before calling it. */ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; struct fib6_table *table; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; } int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; /* * Clone the route. */ if (!fib6_info_hold_safe(f6i)) return NULL; dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(res)) { if (f6i->fib6_dst.plen != 128 && ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif } return rt; } static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; if (f6i->nh) pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); return pcpu_rt; } static bool rt6_is_valid(const struct rt6_info *rt6) { return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); } /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt; pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); dst_release(&prev->dst); } pcpu_rt = NULL; } return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct net *net, const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) return NULL; p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); if (res->f6i->fib6_destroying) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } return pcpu_rt; } /* exception hash table implementation */ static DEFINE_SPINLOCK(rt6_exception_lock); /* Remove rt6_ex from hash table and free the memory * Caller must hold rt6_exception_lock */ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { struct net *net; if (!bucket || !rt6_ex) return; net = dev_net(rt6_ex->rt6i->dst.dev); net->ipv6.rt6_stats->fib_rt_cache--; /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; } /* Remove oldest rt6_ex in bucket and free the memory * Caller must hold rt6_exception_lock */ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) { struct rt6_exception *rt6_ex, *oldest = NULL; if (!bucket) return; hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) oldest = rt6_ex; } rt6_remove_exception(bucket, oldest); } static u32 rt6_exception_hash(const struct in6_addr *dst, const struct in6_addr *src) { static siphash_aligned_key_t rt6_exception_key; struct { struct in6_addr dst; struct in6_addr src; } __aligned(SIPHASH_ALIGNMENT) combined = { .dst = *dst, }; u64 val; net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); #ifdef CONFIG_IPV6_SUBTREES if (src) combined.src = *src; #endif val = siphash(&combined, sizeof(combined), &rt6_exception_key); return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rt6_exception_lock */ static struct rt6_exception * __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rcu_read_lock() */ static struct rt6_exception * __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; WARN_ON_ONCE(!rcu_read_lock_held()); if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } static unsigned int fib6_mtu(const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; unsigned int mtu; if (res->f6i->fib6_pmtu) { mtu = res->f6i->fib6_pmtu; } else { struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL /* used when the flushed bit is not relevant, only access to the bucket * (ie., all bucket users except rt6_insert_exception); * * called under rcu lock; sometimes called with rt6_exception_lock held */ static struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; if (lock) bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); else bucket = rcu_dereference(nh->rt6i_exception_bucket); /* remove bucket flushed bit if set */ if (bucket) { unsigned long p = (unsigned long)bucket; p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; } return bucket; } static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) { unsigned long p = (unsigned long)bucket; return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); } /* called with rt6_exception_lock held */ static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; unsigned long p; bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); p = (unsigned long)bucket; p |= FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); if (!bucket) { err = -ENOMEM; goto out; } rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } else if (fib6_nh_excptn_bucket_flushed(bucket)) { err = -EINVAL; goto out; } #ifdef CONFIG_IPV6_SUBTREES /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by * a hash of only fib6_dst. */ if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu * is less than f6i's mtu value. */ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_remove_exception(bucket, rt6_ex); rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); if (!rt6_ex) { err = -ENOMEM; goto out; } rt6_ex->rt6i = nrt; rt6_ex->stamp = jiffies; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; /* Randomize max depth to avoid some side channels attacks. */ max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } return err; } static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; /* Prevent rt6_insert_exception() to recreate the bucket list */ if (!from) fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { if (!from || rcu_access_pointer(rt6_ex->rt6i->from) == from) rt6_remove_exception(bucket, rt6_ex); } WARN_ON_ONCE(!from && bucket->depth); bucket++; } out: spin_unlock_bh(&rt6_exception_lock); } static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_info *f6i = arg; fib6_nh_flush_exceptions(nh, f6i); return 0; } void rt6_flush_exceptions(struct fib6_info *f6i) { if (f6i->nh) nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); else fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); } /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct rt6_info *ret = NULL; #ifdef CONFIG_IPV6_SUBTREES /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * However, the src addr used to create the hash * might not be exactly the passed in saddr which * is a /128 addr from the flow. * So we need to use f6i->fib6_src to redo lookup * if the passed in saddr does not find anything. * (See the logic in ip6_rt_cache_alloc() on how * rt->rt6i_src is updated.) */ if (res->f6i->fib6_src.plen) src_key = saddr; find_ex: #endif bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) ret = rt6_ex->rt6i; #ifdef CONFIG_IPV6_SUBTREES /* Use fib6_src as src_key and redo lookup */ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) { src_key = &res->f6i->fib6_src.addr; goto find_ex; } #endif return ret; } /* Remove the passed in cached rt from the hash table that contains it */ static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int err; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) { rt6_remove_exception(bucket, rt6_ex); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&rt6_exception_lock); return err; } struct fib6_nh_excptn_arg { struct rt6_info *rt; int plen; }; static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_excptn_arg *arg = _arg; int err; err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); if (err == 0) return 1; return 0; } static int rt6_remove_exception_rt(struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; if (from->nh) { struct fib6_nh_excptn_arg arg = { .rt = rt, .plen = from->fib6_src.plen }; int rc; /* rc = 1 means an entry was found */ rc = nexthop_for_each_fib6_nh(from->nh, rt6_nh_remove_exception_rt, &arg); return rc ? 0 : -ENOENT; } return fib6_nh_remove_exception(from->fib6_nh, from->fib6_src.plen, rt); } /* Find rt6_ex which contains the passed in rt cache and * refresh its stamp */ static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; } struct fib6_nh_match_arg { const struct net_device *dev; const struct in6_addr *gw; struct fib6_nh *match; }; /* determine if fib6_nh has given device and gateway */ static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_match_arg *arg = _arg; if (arg->dev != nh->fib_nh_dev || (arg->gw && !nh->fib_nh_gw_family) || (!arg->gw && nh->fib_nh_gw_family) || (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) return 0; arg->match = nh; /* found a match, break the loop */ return 1; } static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct fib6_info *from; struct fib6_nh *fib6_nh; rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) goto unlock; if (from->nh) { struct fib6_nh_match_arg arg = { .dev = rt->dst.dev, .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); if (!arg.match) goto unlock; fib6_nh = arg.match; } else { fib6_nh = from->fib6_nh; } fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. * * If the new MTU is higher, and the route PMTU is equal to the local * MTU, this means the old MTU is the lowest in the path, so allow * updating it: if other nodes now have lower MTUs, PMTU discovery will * handle this. */ if (dst_mtu(&rt->dst) >= mtu) return true; if (dst_mtu(&rt->dst) == idev->cnf.mtu6) return true; return false; } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected * route), the metrics of its rt->from have already * been updated. */ if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } } #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY && ipv6_addr_equal(gateway, &entry->rt6i_gateway)) { rt6_remove_exception(bucket, rt6_ex); } } bucket++; } } spin_unlock_bh(&rt6_exception_lock); } static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_info *rt = rt6_ex->rt6i; /* we are pruning and obsoleting aged-out and non gateway exceptions * even if others have still references to them, so that on next * dst_check() such references can be dropped. * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } else if (time_after(jiffies, rt->dst.expires)) { pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { pr_debug("purging route %p via non-router but gateway\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } gc_args->more++; } static void fib6_nh_age_exceptions(const struct fib6_nh *nh, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { rt6_age_examine_exception(bucket, rt6_ex, gc_args, now); } bucket++; } } spin_unlock(&rt6_exception_lock); rcu_read_unlock_bh(); } struct fib6_nh_age_excptn_arg { struct fib6_gc_args *gc_args; unsigned long now; }; static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) { struct fib6_nh_age_excptn_arg *arg = _arg; fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); return 0; } void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now) { if (f6i->nh) { struct fib6_nh_age_excptn_arg arg = { .gc_args = gc_args, .now = now }; nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, &arg); } else { fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); } } /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; redo_rt6_select: rt6_select(net, fn, oif, res, strict); if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; else if (strict & RT6_LOOKUP_F_REACHABLE) { /* also consider unreachable route */ strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; } } trace_fib6_table_lookup(net, res, table, fl6); return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct rt6_info *rt = NULL; int strict = 0; WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held()); strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); if (res.f6i == net->ipv6.fib6_null_entry) goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); if (rt) { /* 1 refcnt is taken during ip6_rt_cache_alloc(). * As rt6_uncached_list_add() does not consume refcnt, * this refcnt is always returned to the caller even * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ rt6_uncached_list_add(rt); rcu_read_unlock(); return rt; } } else { /* Get a percpu copy */ local_bh_disable(); rt = rt6_get_pcpu_route(&res); if (!rt) rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); } out: if (!rt) rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) ip6_hold_safe(net, &rt); rcu_read_unlock(); return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *keys, struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; struct icmp6hdr _icmph; if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) goto out; icmph = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(*icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; _flkeys = NULL; out: if (_flkeys) { keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; keys->tags.flow_label = _flkeys->tags.flow_label; keys->basic.ip_proto = _flkeys->basic.ip_proto; } else { keys->addrs.v6addrs.src = key_iph->saddr; keys->addrs.v6addrs.dst = key_iph->daddr; keys->tags.flow_label = ip6_flowlabel(key_iph); keys->basic.ip_proto = key_iph->nexthdr; } } static u32 rt6_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 rt6_multipath_custom_hash_fl6(const struct net *net, const struct flowi6 *fl6) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = fl6->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = fl6->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = fl6->fl6_sport; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } else { hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { struct flow_keys keys; if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, 0); flkeys = &keys; } /* Inner can be v4 or v6 */ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.tags.flow_label = flkeys->tags.flow_label; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = rt6_multipath_custom_hash_skb(net, skb); else mhash = rt6_multipath_custom_hash_fl6(net, fl6); break; } return mhash >> 1; } /* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } static struct dst_entry *ip6_route_output_flags_noref(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { bool any_src; if (ipv6_addr_type(&fl6->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; } fl6->flowi6_iif = LOOPBACK_IFINDEX; flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { struct dst_entry *dst; struct rt6_info *rt6; rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); } rcu_read_unlock(); return dst; } EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif } dst_release(dst_orig); return new ? new : ERR_PTR(-ENOMEM); } /* * Destination cache support functions */ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) return false; if (fib6_check_expired(f6i)) return false; return true; } static struct dst_entry *rt6_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { u32 rt_cookie = 0; if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) return NULL; return &rt->dst; } static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; else return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; struct fib6_info *from; struct rt6_info *rt; rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ from = rcu_dereference(rt->from); if (from && (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->dst.rt_uncached)))) dst_ret = rt6_dst_from_check(rt, from, cookie); else dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); return dst_ret; } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_CACHE) { rcu_read_lock(); if (rt6_check_expired(rt)) { /* rt/dst can not be destroyed yet, * because of rcu_read_lock() */ sk_dst_reset(sk); rt6_remove_exception_rt(rt); } rcu_read_unlock(); return; } sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) { struct rt6_info *rt; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; from = rcu_dereference(rt->from); if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); } } static void rt6_update_expires(struct rt6_info *rt0, int timeout) { if (!(rt0->rt6i_flags & RTF_EXPIRES)) { struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) rt0->dst.expires = from->expires; rcu_read_unlock(); } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct ipv6hdr *iph, u32 mtu, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. * [see also comment in rt6_mtu_change_route()] */ if (iph) { daddr = &iph->daddr; saddr = &iph->saddr; } else if (sk) { daddr = &sk->sk_v6_daddr; saddr = &inet6_sk(sk)->saddr; } else { daddr = NULL; saddr = NULL; } if (confirm_neigh) dst_confirm_neigh(dst, daddr); if (mtu < IPV6_MIN_MTU) return; if (mtu >= dst_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); /* update rt6_ex->stamp for cache */ if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); res.f6i = rcu_dereference(rt6->from); if (!res.f6i) goto out_unlock; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst->dev, .gw = &rt6->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev + gw. Should be impossible. */ if (!arg.match) goto out_unlock; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } out_unlock: rcu_read_unlock(); } } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_oif = oif, .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { int oif = sk->sk_bound_dev_if; struct dst_entry *dst; if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), sk->sk_uid); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; bh_lock_sock(sk); if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ip6_datagram_dst_update(sk, false); bh_unlock_sock(sk); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6) { #ifdef CONFIG_IPV6_SUBTREES struct ipv6_pinfo *np = inet6_sk(sk); #endif ip6_dst_store(sk, dst, ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? &np->saddr : #endif NULL); } static bool ip6_redirect_nh_match(const struct fib6_result *res, struct flowi6 *fl6, const struct in6_addr *gw, struct rt6_info **ret) { const struct fib6_nh *nh = res->nh; if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || fl6->flowi6_oif != nh->fib_nh_dev->ifindex) return false; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. */ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); if (rt_cache && ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { *ret = rt_cache; return true; } return false; } return true; } struct fib6_nh_rd_arg { struct fib6_result *res; struct flowi6 *fl6; const struct in6_addr *gw; struct rt6_info **ret; }; static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_rd_arg *arg = _arg; arg->res->nh = nh; return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); } /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; struct in6_addr gateway; }; INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; struct fib6_nh_rd_arg arg = { .res = &res, .fl6 = fl6, .gw = &rdfl->gateway, .ret = &ret }; struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. * Due to the way the routes are chosen, this notion * is a bit fuzzy and one might need to check all possible * routes. */ rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; if (unlikely(rt->nh)) { if (nexthop_is_blackhole(rt->nh)) continue; /* on match, res->nh is filled in and potentially ret */ if (nexthop_for_each_fib6_nh(rt->nh, fib6_nh_redirect_match, &arg)) goto out; } else { res.nh = rt->fib6_nh; if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) goto out; } } if (!rt) rt = net->ipv6.fib6_null_entry; else if (rt->fib6_flags & RTF_REJECT) { ret = net->ipv6.ip6_null_entry; goto out; } if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } res.f6i = rt; res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); } else { res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; ret = ip6_create_rt_rcu(&res); } rcu_read_unlock(); trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; static struct dst_entry *ip6_route_redirect(struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; rdfl.fl6 = *fl6; rdfl.gateway = *gateway; return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .flowi6_mark = mark, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .daddr = msg->dest, .saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), sk->sk_uid); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); rcu_read_lock(); net = dev_net_rcu(dev); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; rcu_read_unlock(); /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. * IPV6_MAXPLEN is also valid and means: "any MSS, * rely only on pmtu discovery" */ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) mtu = IPV6_MAXPLEN; return mtu; } INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device * * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct fib6_nh *nh = res->nh; struct fib6_info *f6i = res->f6i; struct inet6_dev *idev; struct rt6_info *rt; u32 mtu = 0; if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { mtu = f6i->fib6_pmtu; if (mtu) goto out; } rt = rt6_find_cached_rt(res, daddr, saddr); if (unlikely(rt)) { mtu = dst_metric_raw(&rt->dst, RTAX_MTU); } else { struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); if (idev) mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: return dst; } static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, u32 tbid, int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; int err; table = fib6_get_table(net, tbid); if (!table) return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) fib6_select_path(net, res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); return err; } static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; struct fib6_result res = {}; int err; err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && /* ignore match if it is the default route */ !ipv6_addr_any(&res.f6i->fib6_dst.addr) && (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); err = -EINVAL; } return err; } static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; int flags = RT6_LOOKUP_F_IFACE; struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { err = ip6_nh_lookup_table(net, cfg, gw_addr, cfg->fc_table, flags, &res); /* gw_addr can not require a gateway or resolve to a reject * route. If a device is given, it must match the result. */ if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family || (dev && dev != res.nh->fib_nh_dev)) err = -EHOSTUNREACH; } if (err < 0) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, }; err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family) err = -EHOSTUNREACH; if (err) return err; fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); } err = 0; if (dev) { if (dev != res.nh->fib_nh_dev) err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } return err; } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; int gwa_type = ipv6_addr_type(gw_addr); bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; const struct net_device *dev = *_dev; bool need_addr_check = !dev; int err = -EINVAL; /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() * will return already-added prefix route via interface that * prefix route was assigned to, which might be non-loopback. */ if (dev && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { /* IPv6 strictly inhibits using not link-local * addresses as nexthop address. * Otherwise, router will not able to send redirects. * It is very good, but in some (rare!) circumstances * (SIT, PtP, NBMA NOARP links) it is handy to allow * some exceptions. --ANK * We allow IPv4-mapped nexthops to support RFC4798-type * addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; } rcu_read_lock(); if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, idev); rcu_read_unlock(); if (err) goto out; } /* reload in case device was changed */ dev = *_dev; err = -EINVAL; if (!dev) { NL_SET_ERR_MSG(extack, "Egress device not specified"); goto out; } else if (dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Egress device can not be loopback device for this route"); goto out; } /* if we did not check gw_addr above, do so now that the * egress device has been resolved. */ if (need_addr_check && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } err = 0; out: return err; } static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) { if ((flags & RTF_REJECT) || (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK) && !(flags & (RTF_ANYCAST | RTF_LOCAL)))) return true; return false; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif if (cfg->fc_is_fdb) { fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; return 0; } err = -ENODEV; if (cfg->fc_ifindex) { dev = netdev_get_by_index(net, cfg->fc_ifindex, dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); if (!idev) goto out; } if (cfg->fc_flags & RTNH_F_ONLINK) { if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); goto out; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; } fib6_nh->fib_nh_weight = 1; /* We cannot add true routes via loopback here, * they would result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; goto out; } } goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { err = ip6_validate_gw(net, cfg, &dev, dev_tracker, &idev, extack); if (err) goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; if (!dev) goto out; if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; } if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; pcpu_alloc: fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { err = -ENOMEM; goto out; } fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: if (idev) in6_dev_put(idev); if (err) { fib_nh_common_release(&fib6_nh->nh_common); fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } return err; } void fib6_nh_release(struct fib6_nh *fib6_nh) { struct rt6_exception_bucket *bucket; rcu_read_lock(); fib6_nh_flush_exceptions(fib6_nh, NULL); bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); if (bucket) { rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); kfree(bucket); } rcu_read_unlock(); fib6_nh_release_dsts(fib6_nh); free_percpu(fib6_nh->rt6i_pcpu); fib_nh_common_release(&fib6_nh->nh_common); } void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) { int cpu; if (!fib6_nh->rt6i_pcpu) return; for_each_possible_cpu(cpu) { struct rt6_info *pcpu_rt, **ppcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = xchg(ppcpu_rt, NULL); if (pcpu_rt) { dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); } } } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; struct nexthop *nh = NULL; struct fib6_table *table; struct fib6_nh *fib6_nh; int err = -EINVAL; int addr_type; /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto out; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); goto out; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); goto out; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; } if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); goto out; } #ifndef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); goto out; } #endif if (cfg->fc_nh_id) { nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out; } err = fib6_check_nexthop(nh, cfg, extack); if (err) goto out; } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); if (!table) { pr_warn("NLM_F_CREATE should be specified when creating new route\n"); table = fib6_new_table(net, cfg->fc_table); } } else { table = fib6_new_table(net, cfg->fc_table); } if (!table) goto out; err = -ENOMEM; rt = fib6_info_alloc(gfp_flags, !nh); if (!rt) goto out; rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); /* Do not leave garbage there. */ rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; goto out_free; } if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif if (nh) { if (rt->fib6_src.plen) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); err = -EINVAL; goto out_free; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; goto out_free; } rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); } else { err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out; fib6_nh = rt->fib6_nh; /* We cannot add true routes via loopback here, they would * result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; goto out; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; } else rt->fib6_prefsrc.plen = 0; return rt; out: fib6_info_release(rt); return ERR_PTR(err); out_free: ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); return ERR_PTR(err); } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_info *rt; int err; rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); return err; } static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; int err; if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); out: fib6_info_release(rt); return err; } int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { struct nl_info info = { .nl_net = net, .skip_notify = skip_notify }; return __ip6_del_rt(rt, &info); } static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; int err = -ENOENT; if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); skb = NULL; } else info->skip_notify = 1; } /* 'rt' points to the first sibling route. If it is not the * leaf, then we do not need to send a notification. Otherwise, * we need to check if the last sibling has a next route or not * and emit a replace or delete notification, respectively. */ info->skip_notify_kernel = 1; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (rcu_access_pointer(fn->leaf) == rt) { struct fib6_info *last_sibling, *replace_rt; last_sibling = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); replace_rt = rcu_dereference_protected( last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock)); if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); else call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL); } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; } } err = fib6_del(rt, info); out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; } static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) goto out; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; rc = rt6_remove_exception_rt(rt); out: return rc; } static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, struct fib6_nh *nh) { struct fib6_result res = { .f6i = rt, .nh = nh, }; struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); if (rt_cache) return __ip6_del_cached_rt(rt_cache, cfg); return 0; } struct fib6_nh_del_cached_rt_arg { struct fib6_config *cfg; struct fib6_info *f6i; }; static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_del_cached_rt_arg *arg = _arg; int rc; rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); return rc != -ESRCH ? rc : 0; } static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) { struct fib6_nh_del_cached_rt_arg arg = { .cfg = cfg, .f6i = f6i }; return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); } static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); if (!table) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); return err; } rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, !(cfg->fc_flags & RTF_CACHE)); if (fn) { for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; if (rt->nh && cfg->fc_nh_id && rt->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_flags & RTF_CACHE) { int rc = 0; if (rt->nh) { rc = ip6_del_cached_rt_nh(cfg, rt); } else if (cfg->fc_nh_id) { continue; } else { nh = rt->fib6_nh; rc = ip6_del_cached_rt(cfg, rt, nh); } if (rc != -ESRCH) { rcu_read_unlock(); return rc; } continue; } if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); return __ip6_del_rt(rt, &cfg->fc_nlinfo); } if (cfg->fc_nh_id) continue; nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) return __ip6_del_rt(rt, &cfg->fc_nlinfo); return __ip6_del_rt_siblings(rt, cfg); } } rcu_read_unlock(); return err; } static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; optlen = skb_tail_pointer(skb) - skb_transport_header(skb); optlen -= sizeof(*msg); if (optlen < 0) { net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); return; } msg = (struct rd_msg *)icmp6_hdr(skb); if (ipv6_addr_is_multicast(&msg->dest)) { net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); return; } on_link = 0; if (ipv6_addr_equal(&msg->dest, &msg->target)) { on_link = 1; } else if (ipv6_addr_type(&msg->target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); return; } in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) return; if (READ_ONCE(in6_dev->cnf.forwarding) || !READ_ONCE(in6_dev->cnf.accept_redirects)) return; /* RFC2461 8.1: * The IP source address of the Redirect MUST be the same as the current * first-hop router for the specified ICMP Destination Address. */ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } lladdr = NULL; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, skb->dev); if (!lladdr) { net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); return; } } rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; } /* Redirect received -> path was valid. * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) return; /* * We have finally decided to accept it. */ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); rcu_read_lock(); res.f6i = rcu_dereference(rt->from); if (!res.f6i) goto out; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst->dev, .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev. Should be impossible */ if (!arg.match) goto out; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; /* rt6_insert_exception() will take care of duplicated exceptions */ if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } netevent.old = &rt->dst; netevent.new = &nrt->dst; netevent.daddr = &msg->dest; netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: rcu_read_unlock(); neigh_release(neigh); } #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; for_each_fib6_node_rt_rcu(fn) { /* these routes do not use nexthops */ if (rt->nh) continue; if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || !rt->fib6_nh->fib_nh_gw_family) continue; if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; break; } out: rcu_read_unlock(); return rt; } static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { struct fib6_config cfg = { .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, }; cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } #endif struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) continue; nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) break; } if (rt && !fib6_info_hold_safe(rt)) rt = NULL; rcu_read_unlock(); return rt; } struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, u32 defrtr_usr_metric, int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); if (table) table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } return rt6_get_dflt_router(net, gwaddr, dev); } static void __rt6_purge_dflt_routers(struct net *net, struct fib6_table *table) { struct fib6_info *rt; restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct net_device *dev = fib6_info_nh_dev(rt); struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); ip6_del_rt(net, rt, false); goto restart; } } rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } void rt6_purge_dflt_routers(struct net *net) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) __rt6_purge_dflt_routers(net, table); } } rcu_read_unlock(); } static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { *cfg = (struct fib6_config){ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, .fc_flags = rtmsg->rtmsg_flags, .fc_type = rtmsg->rtmsg_type, .fc_nlinfo.nl_net = net, .fc_dst = rtmsg->rtmsg_dst, .fc_src = rtmsg->rtmsg_src, .fc_gateway = rtmsg->rtmsg_gateway, }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; int err; if (cmd != SIOCADDRT && cmd != SIOCDELRT) return -EINVAL; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtmsg_to_fib6_config(net, rtmsg, &cfg); rtnl_lock(); switch (cmd) { case SIOCADDRT: /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } rtnl_unlock(); return err; } /* * Drop the packet on the floor */ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(dst->dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || dst->dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { SKB_DR_SET(reason, IP_INADDRERRORS); IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } SKB_DR_SET(reason, IP_INNOROUTES); fallthrough; case IPSTATS_MIB_OUTNOROUTES: SKB_DR_OR(reason, IP_OUTNOROUTES); IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } /* Start over by dropping the dst for l3mdev case */ if (netif_is_l3_master(skb->dev)) skb_dst_drop(skb); icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb_reason(skb, reason); return 0; } static int ip6_pkt_discard(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } static int ip6_pkt_prohibit(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } /* * Allocate a dst for local (unicast / anycast) address. */ struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, .fc_ifindex = idev->dev->ifindex, .fc_flags = RTF_UP | RTF_NONEXTHOP, .fc_dst = *addr, .fc_dst_len = 128, .fc_protocol = RTPROT_KERNEL, .fc_nlinfo.nl_net = net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; if (anycast) { cfg.fc_type = RTN_ANYCAST; cfg.fc_flags |= RTF_ANYCAST; } else { cfg.fc_type = RTN_LOCAL; cfg.fc_flags |= RTF_LOCAL; } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (!IS_ERR(f6i)) { f6i->dst_nocount = true; if (!anycast && (READ_ONCE(net->ipv6.devconf_all->disable_policy) || READ_ONCE(idev->cnf.disable_policy))) f6i->dst_nopolicy = true; } return f6i; } /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; spin_unlock_bh(&rt6_exception_lock); } return 0; } void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { .net = net, .addr = &ifp->addr, }; fib6_clean_all(net, fib6_remove_prefsrc, &adni); } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) return 0; nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) { fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_netdev_event { const struct net_device *dev; union { unsigned char nh_flags; unsigned long event; }; }; static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; } /* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; } static int rt6_multipath_total_weight(const struct fib6_info *rt) { struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh->fib_nh_weight; } return total; } static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; if (!rt6_is_dead(rt)) { *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } void rt6_multipath_rebalance(struct fib6_info *rt) { struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, * then there is no need to rebalance upon the removal of every * sibling route. */ if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to * make sure upper bounds are assigned from the first sibling * onwards. */ first = rt6_multipath_first_sibling(rt); if (WARN_ON_ONCE(!first)) return; total = rt6_multipath_total_weight(first); rt6_multipath_upper_bound_set(first, total); } static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && !rt->nh && rt->fib6_nh->fib_nh_dev == arg->dev) { rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } return 0; } void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, { .nh_flags = nh_flags, }, }; if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) arg.nh_flags |= RTNH_F_LINKDOWN; fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } /* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; } static void rt6_multipath_flush(struct fib6_info *rt) { struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { struct fib6_info *iter; unsigned int dead = 0; if (rt->fib6_nh->fib_nh_dev == down_dev || rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == down_dev || iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; } static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned char nh_flags) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } return 0; } void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { struct arg_netdev_event arg = { .dev = dev, { .event = event, }, }; struct net *net = dev_net(dev); if (net->ipv6.sysctl.skip_notify_on_dev_down) fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); else fib6_clean_all(net, fib6_ifdown, &arg); } void rt6_disable_ip(struct net_device *dev, unsigned long event) { rt6_sync_down_dev(dev, event); rt6_uncached_list_flush_dev(dev); neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; struct fib6_info *f6i; }; static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; struct fib6_info *f6i = arg->f6i; /* For administrative MTU increase, there is no way to discover * IPv6 PMTU increase, so PMTU increase should be updated here. * Since RFC 1981 doesn't include administrative MTU increase * update PMTU increase is a MUST. (i.e. jumbo frame) */ if (nh->fib_nh_dev == arg->dev) { struct inet6_dev *idev = __in6_dev_get(arg->dev); u32 mtu = f6i->fib6_pmtu; if (mtu >= arg->mtu || (mtu < arg->mtu && mtu == idev->cnf.mtu6)) fib6_metric_set(f6i, RTAX_MTU, arg->mtu); spin_lock_bh(&rt6_exception_lock); rt6_exceptions_update_pmtu(idev, nh, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } return 0; } static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. We still use this lock to block changes caused by addrconf/ndisc. */ idev = __in6_dev_get(arg->dev); if (!idev) return 0; if (fib6_metric_locked(f6i, RTAX_MTU)) return 0; arg->f6i = f6i; if (f6i->nh) { /* fib6_nh_mtu_change only returns 0, so this is safe */ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, arg); } return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) { struct rt6_mtu_change_arg arg = { .dev = dev, .mtu = mtu, }; fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; unsigned int pref; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_tos) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): option not available for IPv6"); goto errout; } if (tb[RTA_FLOWLABEL]) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Flow label cannot be specified for this operation"); goto errout; } *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, .fc_src_len = rtm->rtm_src_len, .fc_flags = RTF_UP, .fc_protocol = rtm->rtm_protocol, .fc_type = rtm->rtm_type, .fc_nlinfo.portid = NETLINK_CB(skb).portid, .fc_nlinfo.nlh = nlh, .fc_nlinfo.nl_net = sock_net(skb->sk), }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || rtm->rtm_type == RTN_PROHIBIT || rtm->rtm_type == RTN_THROW) cfg->fc_flags |= RTF_REJECT; if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); if (tb[RTA_NH_ID]) { if (tb[RTA_GATEWAY] || tb[RTA_OIF] || tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); goto errout; } cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); } if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } if (tb[RTA_VIA]) { NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); goto errout; } if (tb[RTA_DST]) { int plen = (rtm->rtm_dst_len + 7) >> 3; if (nla_len(tb[RTA_DST]) < plen) goto errout; nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); } if (tb[RTA_SRC]) { int plen = (rtm->rtm_src_len + 7) >> 3; if (nla_len(tb[RTA_SRC]) < plen) goto errout; nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); } if (tb[RTA_PREFSRC]) cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_PRIORITY]) cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); if (tb[RTA_METRICS]) { cfg->fc_mx = nla_data(tb[RTA_METRICS]); cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); } if (tb[RTA_TABLE]) cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); if (tb[RTA_MULTIPATH]) { cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack, true); if (err < 0) goto errout; } if (tb[RTA_PREF]) { pref = nla_get_u8(tb[RTA_PREF]); if (pref != ICMPV6_ROUTER_PREF_LOW && pref != ICMPV6_ROUTER_PREF_HIGH) pref = ICMPV6_ROUTER_PREF_MEDIUM; cfg->fc_flags |= RTF_PREF(pref); } if (tb[RTA_ENCAP]) cfg->fc_encap = tb[RTA_ENCAP]; if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack, true); if (err < 0) goto errout; } if (tb[RTA_EXPIRES]) { unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); if (addrconf_finite_timeout(timeout)) { cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); cfg->fc_flags |= RTF_EXPIRES; } } err = 0; errout: return err; } struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head next; }; static int ip6_route_info_append(struct net *net, struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; int err = -EEXIST; list_for_each_entry(nh, rt6_nh_list, next) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return err; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->next, rt6_nh_list); return 0; } static void ip6_route_mpath_notify(struct fib6_info *rt, struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { /* if this is an APPEND route, then rt points to the first route * inserted and rt_last points to last route inserted. Userspace * wants a consistent dump of the route which starts at the first * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ rcu_read_lock(); if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { rt = list_first_or_null_rcu(&rt_last->fib6_siblings, struct fib6_info, fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) { bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool should_notify = false; struct fib6_info *leaf; struct fib6_node *fn; rcu_read_lock(); fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; leaf = rcu_dereference(fn->leaf); if (!leaf) goto out; if (rt == leaf || (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf))) should_notify = true; out: rcu_read_unlock(); return should_notify; } static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, struct netlink_ext_ack *extack) { if (nla_len(nla) < sizeof(*gw)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); return -EINVAL; } *gw = nla_get_in6_addr(nla); return 0; } static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; __u16 nlflags; int remaining; int attrlen; int err = 1; int nhn = 0; int replace = (cfg->fc_nlinfo.nlh && (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); LIST_HEAD(rt6_nh_list); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, extack); if (err) goto cleanup; r_cfg.fc_flags |= RTF_GATEWAY; } r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); /* RTA_ENCAP_TYPE length checked in * lwtunnel_valid_encap_type_attr */ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto cleanup; } if (!rt6_qualify_for_ecmp(rt)) { err = -EINVAL; NL_SET_ERR_MSG(extack, "Device only routes can not be added for IPv6 using the multipath API."); fib6_info_release(rt); goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; } rtnh = rtnh_next(rtnh, &remaining); } if (list_empty(&rt6_nh_list)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); return -EINVAL; } /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; /* For add and replace, send one notification with all nexthops. For * append, send one notification with all appended nexthops. */ info->skip_notify_kernel = 1; err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { if (replace && nhn) NL_SET_ERR_MSG_MOD(extack, "multipath route replace failed (check consistency of installed routes)"); err_nh = nh; goto add_errout; } /* save reference to last route successfully inserted */ rt_last = nh->fib6_info; /* save reference to first route for notification */ if (!rt_notif) rt_notif = nh->fib6_info; /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: * fib6_add_rt2node() has rejected it; when replacing, old * nexthops have been replaced by first new, the rest should * be added to it. */ if (cfg->fc_nlinfo.nlh) { cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; } nhn++; } /* An in-kernel notification should only be sent in case the new * multipath route is added as the first route in the node, or if * it was appended to it. We pass 'rt_notif' since it is the first * sibling and might allow us to skip some checks in the replace case. */ if (ip6_route_mpath_should_notify(rt_notif)) { enum fib_event_type fib_event; if (rt_notif->fib6_nsiblings != nhn - 1) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_multipath_entry_notifiers(info->nl_net, fib_event, rt_notif, nhn - 1, extack); if (err) { /* Delete all the siblings that were just added */ err_nh = NULL; goto add_errout; } } /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; add_errout: /* send notification for routes that were added so that * the delete notifications sent by ip6_route_del are * coherent */ if (rt_notif) ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ list_for_each_entry(nh, &rt6_nh_list, next) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { fib6_info_release(nh->fib6_info); list_del(&nh->next); kfree(nh); } return err; } static int ip6_route_multipath_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_config r_cfg; struct rtnexthop *rtnh; int last_err = 0; int remaining; int attrlen; int err; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, extack); if (err) { last_err = err; goto next_rtnh; } r_cfg.fc_flags |= RTF_GATEWAY; } } err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } return last_err; } static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_nh_id && !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); return -EINVAL; } if (cfg.fc_mp) return ip6_route_multipath_del(&cfg, extack); else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } } static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else return ip6_route_add(&cfg, GFP_KERNEL, extack); } /* add the overhead of this fib6_nh to nexthop_len */ static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { int *nexthop_len = arg; *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16); /* RTA_GATEWAY */ if (nh->fib_nh_lws) { /* RTA_ENCAP_TYPE */ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); /* RTA_ENCAP */ *nexthop_len += nla_total_size(2); } return 0; } static size_t rt6_nlmsg_size(struct fib6_info *f6i) { int nexthop_len; if (f6i->nh) { nexthop_len = nla_total_size(4); /* RTA_NH_ID */ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); } else { struct fib6_nh *nh = f6i->fib6_nh; struct fib6_info *sibling; nexthop_len = 0; if (f6i->fib6_nsiblings) { rt6_nh_nlmsg_size(nh, &nexthop_len); rcu_read_lock(); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); } rcu_read_unlock(); } nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ + nla_total_size(16) /* RTA_GATEWAY */ + nla_total_size(16) /* RTA_PREFSRC */ + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_IIF */ + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ + nexthop_len; } static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, unsigned char *flags) { if (nexthop_is_multipath(nh)) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) goto nla_put_failure; nla_nest_end(skb, mp); } else { struct fib6_nh *fib6_nh; fib6_nh = nexthop_fib6_nh(nh); if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, flags, false) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; if (rt6) { rt6_dst = &rt6->rt6i_dst; rt6_src = &rt6->rt6i_src; rt6_flags = rt6->rt6i_flags; } else { rt6_dst = &rt->fib6_dst; rt6_src = &rt->fib6_src; rt6_flags = rt->fib6_flags; } rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; rtm->rtm_dst_len = rt6_dst->plen; rtm->rtm_src_len = rt6_src->plen; rtm->rtm_tos = 0; if (rt->fib6_table) table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->fib6_protocol; if (rt6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { if (nla_put_in6_addr(skb, RTA_SRC, src)) goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt6_dst->addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) return 0; if (err < 0) goto nla_put_failure; } else #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ if (rt6) { if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) goto nla_put_failure; if (dst->lwtstate && lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (rt->fib6_nsiblings) { struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, rt->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) goto nla_put_failure; rcu_read_lock(); list_for_each_entry_rcu(sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) && rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } else { if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, &nh_flags, false) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { expires = dst ? dst->expires : rt->expires; expires -= jiffies; } if (!dst) { if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) { const struct net_device *dev = arg; if (nh->fib_nh_dev == dev) return 1; return 0; } static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { if (f6i->nh) { struct net_device *_dev = (struct net_device *)dev; return !!nexthop_for_each_fib6_nh(f6i->nh, fib6_info_nh_uses_dev, _dev); } if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } return false; } struct fib6_nh_exception_dump_walker { struct rt6_rtnl_dump_arg *dump; struct fib6_info *rt; unsigned int flags; unsigned int skip; unsigned int count; }; static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_nh_exception_dump_walker *w = arg; struct rt6_rtnl_dump_arg *dump = w->dump; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i, err; bucket = fib6_nh_get_excptn_bucket(nh, NULL); if (!bucket) return 0; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (w->skip) { w->skip--; continue; } /* Expiration of entries doesn't bump sernum, insertion * does. Removal is triggered by insertion, so we can * rely on the fact that if entries change between two * partial dumps, this node is scanned again completely, * see rt6_insert_exception() and fib6_dump_table(). * * Count expired entries we go through as handled * entries that we'll skip next time, in case of partial * node dump. Otherwise, if entries expire meanwhile, * we'll skip the wrong amount. */ if (rt6_check_expired(rt6_ex->rt6i)) { w->count++; continue; } err = rt6_fill_node(dump->net, dump->skb, w->rt, &rt6_ex->rt6i->dst, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(dump->cb->skb).portid, dump->cb->nlh->nlmsg_seq, w->flags); if (err) return err; w->count++; } bucket++; } return 0; } /* Return -1 if done with node, number of handled routes on partial dump */ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; int count = 0; if (rt == net->ipv6.fib6_null_entry) return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return -1; } if (filter->filter_set && ((filter->rt_type && rt->fib6_type != filter->rt_type) || (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || (filter->protocol && rt->fib6_protocol != filter->protocol))) { return -1; } if (filter->filter_set || !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } if (filter->dump_routes) { if (skip) { skip--; } else { if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, flags)) { return 0; } count++; } } if (filter->dump_exceptions) { struct fib6_nh_exception_dump_walker w = { .dump = arg, .rt = rt, .flags = flags, .skip = skip, .count = 0 }; int err; rcu_read_lock(); if (rt->nh) { err = nexthop_for_each_fib6_nh(rt->nh, rt6_nh_dump_exceptions, &w); } else { err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); } rcu_read_unlock(); if (err) return count + w.count; } return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } if (tb[RTA_FLOWLABEL] && (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Invalid flow label"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_SRC: case RTA_DST: case RTA_IIF: case RTA_OIF: case RTA_MARK: case RTA_UID: case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) iif = nla_get_u32(tb[RTA_IIF]); if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); if (tb[RTA_UID]) fl6.flowi6_uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); if (tb[RTA_SPORT]) fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &fl6.flowi6_proto, AF_INET6, extack); if (err) goto errout; } flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); if (iif) { struct net_device *dev; int flags = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) { rcu_read_unlock(); err = -ENODEV; goto errout; } fl6.flowi6_iif = iif; if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; dst = ip6_route_output(net, NULL, &fl6); } rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } if (rt == net->ipv6.ip6_null_entry) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); err = -ENOBUFS; goto errout; } skb_dst_set(skb, &rt->dst); rcu_read_lock(); from = rcu_dereference(rt->from); if (from) { if (fibmatch) err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } else { err = -ENETUNREACH; } rcu_read_unlock(); if (err < 0) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; u32 seq; int err; err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, event, info->portid, seq, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed) { struct sk_buff *skb; int err; if (READ_ONCE(f6i->offload) == offload && READ_ONCE(f6i->trap) == trap && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload, offload); WRITE_ONCE(f6i->trap, trap); /* 2 means send notifications only if offload_failed was changed. */ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send * notification. */ return; if (!net->ipv6.sysctl.fib_notify_on_flag_change) return; skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } EXPORT_SYMBOL(fib6_info_hw_flags_set); static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (!(dev->flags & IFF_LOOPBACK)) return NOTIFY_OK; if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif } else if (event == NETDEV_UNREGISTER && dev->reg_state != NETREG_UNREGISTERED) { /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } return NOTIFY_OK; } /* * /proc */ #ifdef CONFIG_PROC_FS static int rt6_stats_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), net->ipv6.rt6_stats->fib_discarded_routes); return 0; } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; int ret; if (!write) return -EINVAL; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; net = (struct net *)ctl->extra1; delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } static struct ctl_table ipv6_route_table_template[] = { { .procname = "max_size", .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_thresh", .data = &ip6_dst_ops_template.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv6_sysctl_rtcache_flush }, { .procname = "gc_min_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_timeout", .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_elasticity", .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "mtu_expires", .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_min_interval_ms", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "skip_notify_on_dev_down", .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_route_table_template, sizeof(ipv6_route_table_template), GFP_KERNEL); if (table) { table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; table[2].data = &net->ipv6.sysctl.flush_delay; table[2].extra1 = net; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; } return table; } size_t ipv6_route_sysctl_table_size(struct net *net) { /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) return 1; return ARRAY_SIZE(ipv6_route_table_template); } #endif static int __net_init ip6_route_net_init(struct net *net) { int ret = -ENOMEM; memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, sizeof(net->ipv6.ip6_dst_ops)); if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_SUBTREES net->ipv6.fib6_routes_require_src = 0; #endif #endif net->ipv6.sysctl.flush_delay = 0; net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: return ret; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_ip6_prohibit_entry: kfree(net->ipv6.ip6_prohibit_entry); out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif out_fib6_null_entry: kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: goto out; } static void __net_exit ip6_route_net_exit(struct net *net) { kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif dst_entries_destroy(&net->ipv6.ip6_dst_ops); } static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, sizeof(struct ipv6_route_iter))) return -ENOMEM; if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, rt6_stats_seq_show, NULL)) { remove_proc_entry("ipv6_route", net->proc_net); return -ENOMEM; } #endif return 0; } static void __net_exit ip6_route_net_exit_late(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ipv6_route", net->proc_net); remove_proc_entry("rt6_stats", net->proc_net); #endif } static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, }; static int __net_init ipv6_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv6.peers = bp; return 0; } static void __net_exit ipv6_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv6.peers; net->ipv6.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, }; static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, }; static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, .priority = ADDRCONF_NOTIFY_PRIORITY - 10, }; void __init ip6_route_init_special_entries(void) { /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #endif } #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) BTF_ID_LIST(btf_fib6_info_id) BTF_ID(struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), }; static struct bpf_iter_reg ipv6_route_reg_info = { .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) { bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, .doit = inet6_rtm_newroute}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, .doit = inet6_rtm_delroute}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip6_route_init(void) { int ret; int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; ret = register_pernet_subsys(&ipv6_inetpeer_ops); if (ret) goto out_dst_entries; ret = register_pernet_subsys(&ip6_route_net_ops); if (ret) goto out_register_inetpeer; ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; ret = fib6_init(); if (ret) goto out_register_subsys; ret = xfrm6_init(); if (ret) goto out_fib6_init; ret = fib6_rules_init(); if (ret) goto xfrm6_init; ret = register_pernet_subsys(&ip6_route_net_late_ops); if (ret) goto fib6_rules_init; ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); if (ret) goto out_register_late_subsys; #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) goto out_register_late_subsys; #endif #endif for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } out: return ret; out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: xfrm6_fini(); out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); out_register_inetpeer: unregister_pernet_subsys(&ipv6_inetpeer_ops); out_dst_entries: dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; } void ip6_route_cleanup(void) { #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_unregister(); #endif #endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ipv6_inetpeer_ops); unregister_pernet_subsys(&ip6_route_net_ops); dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } |
157 157 157 156 156 157 157 157 157 156 156 157 157 156 155 1 156 164 166 166 127 127 127 165 166 165 127 127 23 23 22 23 23 23 23 22 23 23 157 157 156 157 157 157 157 157 157 157 157 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/switch.h> #include <linux/arm-smccc.h> #include <linux/kvm_host.h> #include <linux/types.h> #include <linux/jump_label.h> #include <linux/percpu.h> #include <uapi/linux/psci.h> #include <kvm/arm_psci.h> #include <asm/barrier.h> #include <asm/cpufeature.h> #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/vectors.h> /* VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); /* * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1 * semantics, irrespective of the configuration), but that cannot be * applied to the actual HW as things would otherwise break badly. * * - TGE: we want the guest to use EL1, which is incompatible with * this bit being set * * - API/APK: they are already accounted for by vcpu_load(), and can * only take effect across a load/put cycle (such as ERET) */ #define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK) static u64 __compute_hcr(struct kvm_vcpu *vcpu) { u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); u64 hcr = vcpu->arch.hcr_el2; if (!vcpu_has_nv(vcpu)) return hcr; /* * We rely on the invariant that a vcpu entered from HYP * context must also exit in the same context, as only an ERET * instruction can kick us out of it, and we obviously trap * that sucker. PSTATE.M will get fixed-up on exit. */ if (is_hyp_ctxt(vcpu)) { host_data_set_flag(VCPU_IN_HYP_CONTEXT); hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; if (!vcpu_el2_e2h_is_set(vcpu)) hcr |= HCR_NV1; write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); } else { host_data_clear_flag(VCPU_IN_HYP_CONTEXT); if (guest_hcr & HCR_NV) { u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); /* Inherit the low bits from the actual register */ va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); write_sysreg_s(va, SYS_VNCR_EL2); /* Force NV2 in case the guest is forgetful... */ guest_hcr |= HCR_NV2; } } BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && host_data_test_flag(L1_VNCR_MAPPED)); return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { u64 cptr; /* * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, * except for some missing controls, such as TAM. * In this case, CPTR_EL2.TAM has the same position with or without * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM * shift value for trapping the AMU accesses. */ u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; if (guest_owns_fp_regs()) { val |= CPACR_EL1_FPEN; if (vcpu_has_sve(vcpu)) val |= CPACR_EL1_ZEN; } else { __activate_traps_fpsimd32(vcpu); } if (!vcpu_has_nv(vcpu)) goto write; /* * The architecture is a bit crap (what a surprise): an EL2 guest * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, * as they are RES0 in the guest's view. To work around it, trap the * sucker using the very same bit it can't set... */ if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) val |= CPTR_EL2_TCPAC; /* * Layer the guest hypervisor's trap configuration on top of our own if * we're in a nested context. */ if (is_hyp_ctxt(vcpu)) goto write; cptr = vcpu_sanitised_cptr_el2(vcpu); /* * Pay attention, there's some interesting detail here. * * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): * * - CPTR_EL2.xEN = x0, traps are enabled * - CPTR_EL2.xEN = x1, traps are disabled * * In other words, bit[0] determines if guest accesses trap or not. In * the interest of simplicity, clear the entire field if the guest * hypervisor has traps enabled to dispel any illusion of something more * complicated taking place. */ if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) val &= ~CPACR_EL1_FPEN; if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) val &= ~CPACR_EL1_ZEN; if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) val |= cptr & CPACR_EL1_E0POE; val |= cptr & CPTR_EL2_TCPAC; write: write_sysreg(val, cpacr_el1); } static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) { u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN_EL1EN; write_sysreg(val, cpacr_el1); } static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; ___activate_traps(vcpu, __compute_hcr(vcpu)); if (has_cntpoff()) { struct timer_map map; get_timer_map(vcpu, &map); /* * We're entrering the guest. Reload the correct * values from memory now that TGE is clear. */ if (map.direct_ptimer == vcpu_ptimer(vcpu)) val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); if (map.direct_ptimer == vcpu_hptimer(vcpu)) val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); if (map.direct_ptimer) { write_sysreg_el0(val, SYS_CNTP_CVAL); isb(); } } __activate_cptr_traps(vcpu); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } NOKPROBE_SYMBOL(__activate_traps); static void __deactivate_traps(struct kvm_vcpu *vcpu) { const char *host_vectors = vectors; ___deactivate_traps(vcpu); write_sysreg_hcr(HCR_HOST_VHE_FLAGS); if (has_cntpoff()) { struct timer_map map; u64 val, offset; get_timer_map(vcpu, &map); /* * We're exiting the guest. Save the latest CVAL value * to memory and apply the offset now that TGE is set. */ val = read_sysreg_el0(SYS_CNTP_CVAL); if (map.direct_ptimer == vcpu_ptimer(vcpu)) __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val; if (map.direct_ptimer == vcpu_hptimer(vcpu)) __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val; offset = read_sysreg_s(SYS_CNTPOFF_EL2); if (map.direct_ptimer && offset) { write_sysreg_el0(val + offset, SYS_CNTP_CVAL); isb(); } } /* * ARM errata 1165522 and 1530923 require the actual execution of the * above before we can switch to the EL2/EL0 translation regime used by * the host. */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); write_sysreg(host_vectors, vbar_el1); } NOKPROBE_SYMBOL(__deactivate_traps); /* * Disable IRQs in __vcpu_{load,put}_{activate,deactivate}_traps() to * prevent a race condition between context switching of PMUSERENR_EL0 * in __{activate,deactivate}_traps_common() and IPIs that attempts to * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). */ static void __vcpu_load_activate_traps(struct kvm_vcpu *vcpu) { unsigned long flags; local_irq_save(flags); __activate_traps_common(vcpu); local_irq_restore(flags); } static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu) { unsigned long flags; local_irq_save(flags); __deactivate_traps_common(vcpu); local_irq_restore(flags); } void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) { host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu; __vcpu_load_switch_sysregs(vcpu); __vcpu_load_activate_traps(vcpu); __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); } void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) { __vcpu_put_deactivate_traps(vcpu); __vcpu_put_switch_sysregs(vcpu); host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; } static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) { unsigned long ctl; u64 cval, cnt; bool stat; switch (reg) { case CNTP_CTL_EL0: cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); cnt = compute_counter_value(vcpu_ptimer(vcpu)); break; case CNTV_CTL_EL0: cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); cnt = compute_counter_value(vcpu_vtimer(vcpu)); break; default: BUG(); } stat = cval <= cnt; __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); return ctl; } static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr, val; /* * Having FEAT_ECV allows for a better quality of timer emulation. * However, this comes at a huge cost in terms of traps. Try and * satisfy the reads from guest's hypervisor context without * returning to the kernel if we can. */ if (!is_hyp_ctxt(vcpu)) return false; esr = kvm_vcpu_get_esr(vcpu); if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) return false; switch (esr_sys64_to_sysreg(esr)) { case SYS_CNTP_CTL_EL02: val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); break; case SYS_CNTP_CTL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) val = read_sysreg_el0(SYS_CNTP_CTL); else val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); break; case SYS_CNTP_CVAL_EL02: val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); break; case SYS_CNTP_CVAL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) { val = read_sysreg_el0(SYS_CNTP_CVAL); if (!has_cntpoff()) val -= timer_get_offset(vcpu_hptimer(vcpu)); } else { val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); } break; case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: val = compute_counter_value(vcpu_hptimer(vcpu)); break; case SYS_CNTV_CTL_EL02: val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); break; case SYS_CNTV_CTL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) val = read_sysreg_el0(SYS_CNTV_CTL); else val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); break; case SYS_CNTV_CVAL_EL02: val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); break; case SYS_CNTV_CVAL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) val = read_sysreg_el0(SYS_CNTV_CVAL); else val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); break; case SYS_CNTVCT_EL0: case SYS_CNTVCTSS_EL0: val = compute_counter_value(vcpu_hvtimer(vcpu)); break; default: return false; } vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); return true; } static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); u64 spsr, elr, mode; /* * Going through the whole put/load motions is a waste of time * if this is a VHE guest hypervisor returning to its own * userspace, or the hypervisor performing a local exception * return. No need to save/restore registers, no need to * switch S2 MMU. Just do the canonical ERET. * * Unless the trap has to be forwarded further down the line, * of course... */ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) || (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET)) return false; spsr = read_sysreg_el1(SYS_SPSR); mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { case PSR_MODE_EL0t: if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) return false; break; case PSR_MODE_EL2t: mode = PSR_MODE_EL1t; break; case PSR_MODE_EL2h: mode = PSR_MODE_EL1h; break; default: return false; } /* If ERETAx fails, take the slow path */ if (esr_iss_is_eretax(esr)) { if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr))) return false; } else { elr = read_sysreg_el1(SYS_ELR); } spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; write_sysreg_el2(spsr, SYS_SPSR); write_sysreg_el2(elr, SYS_ELR); return true; } static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { int ret = -EINVAL; u32 instr; u64 val; /* * Ideally, we would never trap on EL2 S1 TLB invalidations using * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}. * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2, * meaning that we can't track changes to the virtual TGE bit. So we * have to leave HCR_EL2.TTLB set on the host. Oopsie... * * Try and handle these invalidation as quickly as possible, without * fully exiting. Note that we don't need to consider any forwarding * here, as having E2H+TGE set is the very definition of being * InHost. * * For the lesser hypervisors out there that have failed to get on * with the VHE program, we can also handle the nVHE style of EL2 * invalidation. */ if (!(is_hyp_ctxt(vcpu))) return false; instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) && vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) || kvm_supported_tlbi_s1e2_op (vcpu, instr)) ret = __kvm_tlbi_s1e2(NULL, val, instr); if (ret) return false; /* * If we have to check for any VNCR mapping being invalidated, * go back to the slow path for further processing. */ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && atomic_read(&vcpu->kvm->arch.vncr_map_count)) return false; __kvm_skip_instr(vcpu); return true; } static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); int rt; if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1) return false; rt = kvm_vcpu_sys_get_rt(vcpu); if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) { vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2)); } else { vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2); __activate_cptr_traps(vcpu); } __kvm_skip_instr(vcpu); return true; } static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); if (!vcpu_has_nv(vcpu)) return false; if (sysreg != SYS_ZCR_EL2) return false; if (guest_owns_fp_regs()) return false; /* * ZCR_EL2 traps are handled in the slow path, with the expectation * that the guest's FP context has already been loaded onto the CPU. * * Load the guest's FP context and unconditionally forward to the * slow path for handling (i.e. return false). */ kvm_hyp_handle_fpsimd(vcpu, exit_code); return false; } static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) return true; if (kvm_hyp_handle_timer(vcpu, exit_code)) return true; if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) return true; if (kvm_hyp_handle_zcr_el2(vcpu, exit_code)) return true; return kvm_hyp_handle_sysreg(vcpu, exit_code); } static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 iss; if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) return false; /* * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 * is populated with a correct ISS for a sysreg trap. These fruity * parts are 64bit only, so unconditionally set IL. */ iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | FIELD_PREP(ESR_ELx_ISS_MASK, iss) | ESR_ELx_IL; return false; } static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe, [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, /* Apple shenanigans */ [0x3F] = kvm_hyp_handle_impdef, }; static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { synchronize_vcpu_pstate(vcpu, exit_code); /* * If we were in HYP context on entry, adjust the PSTATE view * so that the usual helpers work correctly. This enforces our * invariant that the guest's HYP context status is preserved * across a run. */ if (vcpu_has_nv(vcpu) && unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { case PSR_MODE_EL1t: mode = PSR_MODE_EL2t; break; case PSR_MODE_EL1h: mode = PSR_MODE_EL2h; break; } *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } /* Apply extreme paranoia! */ BUG_ON(vcpu_has_nv(vcpu) && !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; u64 exit_code; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; sysreg_save_host_state_vhe(host_ctxt); fpsimd_lazy_switch_to_guest(vcpu); /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear * HCR_EL2.TGE. The stage 1 and stage 2 guest context has already been * loaded on the CPU in kvm_vcpu_load_vhe(). */ __activate_traps(vcpu); __kvm_adjust_pc(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); do { /* Jump in the fire! */ exit_code = __guest_enter(vcpu); /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); sysreg_save_guest_state_vhe(guest_ctxt); __deactivate_traps(vcpu); fpsimd_lazy_switch_to_host(vcpu); sysreg_restore_host_state_vhe(host_ctxt); if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { int ret; local_daif_mask(); /* * Having IRQs masked via PMR when entering the guest means the GIC * will not signal the CPU of interrupts of lower priority, and the * only way to get out will be via guest exceptions. * Naturally, we want to avoid this. * * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a * dsb to ensure the redistributor is forwards EL2 IRQs to the CPU. */ pmr_sync(); ret = __kvm_vcpu_run_vhe(vcpu); /* * local_daif_restore() takes care to properly restore PSTATE.DAIF * and the GIC PMR if the host is using IRQ priorities. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); /* * When we exit from the guest we change a number of CPU configuration * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() * to make sure these changes take effect before running the host or * additional guests. */ return ret; } static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) { struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", spsr, elr, read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR), read_sysreg(hpfar_el2), par, vcpu); } NOKPROBE_SYMBOL(__hyp_call_panic); void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg_par(); __hyp_call_panic(spsr, elr, par); } asmlinkage void kvm_unexpected_el2_exception(void) { __kvm_unexpected_el2_exception(); } |
179 179 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Implementations of the security context functions. * * Author: Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2020 Red Hat, Inc. */ #include <linux/jhash.h> #include "context.h" #include "mls.h" u32 context_compute_hash(const struct context *c) { u32 hash = 0; /* * If a context is invalid, it will always be represented by a * context struct with only the len & str set (and vice versa) * under a given policy. Since context structs from different * policies should never meet, it is safe to hash valid and * invalid contexts differently. The context_equal() function * already operates under the same assumption. */ if (c->len) return full_name_hash(NULL, c->str, c->len); hash = jhash_3words(c->user, c->role, c->type, hash); hash = mls_range_hash(&c->range, hash); return hash; } |
72 5 66 72 61 61 12 46 74 1 1 73 71 10 62 13 55 67 75 23 5 69 17 52 69 76 27 27 28 2 6 21 27 27 28 28 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * @cookie: cookie to reset * * Update the file offset to the value specified by @offset if the given * offset is valid and it is not equal to the current file offset and * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; if (cookie) *cookie = 0; } return offset; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. * * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * If the file requires locking via f_pos_lock we know * that mutual exclusion for SEEK_CUR on the same file * is guaranteed. If the file isn't locked, we take * f_lock to protect against f_pos races with other * SEEK_CURs. */ if (file_seek_cur_needs_f_lock(file)) { guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. */ loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie) { struct inode *inode = file->f_mapping->host; loff_t maxsize = inode->i_sb->s_maxbytes; loff_t eof = i_size_read(inode); int ret; if (WARN_ON_ONCE(!cookie)) return -EINVAL; /* * Require that this is only used for directories that guarantee * synchronization between readdir and seek so that an update to * @cookie is correctly synchronized with concurrent readdir. */ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) return -EINVAL; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; /* No need to hold f_lock because we know that f_pos_lock is held. */ if (whence == SEEK_CUR) return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); return vfs_setpos_cookie(file, offset, maxsize, cookie); } EXPORT_SYMBOL(generic_llseek_cookie); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; CLASS(fd_pos, f)(fd); if (fd_empty(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; CLASS(fd_pos, f)(fd); loff_t offset; if (fd_empty(f)) return -EBADF; if (whence > SEEK_MAX) return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PREAD) return vfs_read(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PWRITE) return vfs_write(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ CLASS(fd, in)(in_fd); if (fd_empty(in)) return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ CLASS(fd, out)(out_fd); if (fd_empty(out)) return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { if (pos >= max) return -EOVERFLOW; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; ssize_t ret = -EBADF; CLASS(fd, f_in)(fd_in); if (fd_empty(f_in)) return -EBADF; CLASS(fd, f_out)(fd_out); if (fd_empty(f_out)) return -EBADF; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } if (flags != 0) return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return -EINVAL; if (!is_power_of_2(len)) return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; if (!(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid); |
539 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005-2010 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: evm.h */ #ifndef __INTEGRITY_EVM_H #define __INTEGRITY_EVM_H #include <linux/xattr.h> #include <linux/security.h> #include "../integrity.h" #define EVM_INIT_HMAC 0x0001 #define EVM_INIT_X509 0x0002 #define EVM_ALLOW_METADATA_WRITES 0x0004 #define EVM_SETUP_COMPLETE 0x80000000 /* userland has signaled key load */ #define EVM_KEY_MASK (EVM_INIT_HMAC | EVM_INIT_X509) #define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP_COMPLETE | \ EVM_ALLOW_METADATA_WRITES) struct xattr_list { struct list_head list; char *name; bool enabled; }; #define EVM_NEW_FILE 0x00000001 #define EVM_IMMUTABLE_DIGSIG 0x00000002 /* EVM integrity metadata associated with an inode */ struct evm_iint_cache { unsigned long flags; enum integrity_status evm_status:4; struct integrity_inode_attributes metadata_inode; }; extern struct lsm_blob_sizes evm_blob_sizes; static inline struct evm_iint_cache *evm_iint_inode(const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + evm_blob_sizes.lbs_inode; } extern int evm_initialized; #define EVM_ATTR_FSUUID 0x0001 extern int evm_hmac_attrs; /* List of EVM protected security xattrs */ extern struct list_head evm_config_xattrnames; struct evm_digest { struct ima_digest_data_hdr hdr; char digest[IMA_MAX_DIGEST_SIZE]; } __packed; int evm_protected_xattr(const char *req_xattr_name); int evm_init_key(void); int evm_update_evmxattr(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len); int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len, struct evm_digest *data, struct evm_iint_cache *iint); int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len, char type, struct evm_digest *data, struct evm_iint_cache *iint); int evm_init_hmac(struct inode *inode, const struct xattr *xattrs, char *hmac_val); int evm_init_secfs(void); #endif |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov */ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H #include <linux/bitops.h> #include <linux/gfp_types.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/math.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/xarray.h> #include <linux/local_lock.h> /* Keep unconverted code working */ #define radix_tree_root xarray #define radix_tree_node xa_node struct radix_tree_preload { local_lock_t lock; unsigned nr; /* nodes->parent points to next preallocated node */ struct radix_tree_node *nodes; }; DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: * * 00 - data pointer * 10 - internal entry * x1 - value entry * * The internal entry may be a pointer to the next level in the tree, a * sibling entry, or an indicator that the entry in this slot has been moved * to another location in the tree and the lookup should be restarted. While * NULL fits the 'data pointer' pattern, it means that there is no entry in * the tree for this index (no matter what level of the tree it is found at). * This means that storing a NULL entry in the tree is the same as deleting * the entry from the tree. */ #define RADIX_TREE_ENTRY_MASK 3UL #define RADIX_TREE_INTERNAL_NODE 2UL static inline bool radix_tree_is_internal_node(void *ptr) { return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) == RADIX_TREE_INTERNAL_NODE; } /*** radix-tree API starts here ***/ #define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) #define RADIX_TREE_MAX_TAGS XA_MAX_MARKS #define RADIX_TREE_TAG_LONGS XA_MARK_LONGS #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) /* The IDR tag is stored in the low bits of xa_flags */ #define ROOT_IS_IDR ((__force gfp_t)4) /* The top bits of xa_flags are used to store the root tags */ #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) #define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask) #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask) static inline bool radix_tree_empty(const struct radix_tree_root *root) { return root->xa_head == NULL; } /** * struct radix_tree_iter - radix tree iterator state * * @index: index of current slot * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating * @node: node that contains current slot * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is * described by a pointer to its first slot and a struct radix_tree_iter * which holds the chunk's position in the tree and its size. For tagged * iteration radix_tree_iter also holds the slots' bit-mask for one chosen * radix tree tag. */ struct radix_tree_iter { unsigned long index; unsigned long next_index; unsigned long tags; struct radix_tree_node *node; }; /** * Radix-tree synchronization * * The radix-tree API requires that users provide all synchronisation (with * specific exceptions, noted below). * * Synchronization of access to the data items being stored in the tree, and * management of their lifetimes must be completely managed by API users. * * For API usage, in general, * - any function _modifying_ the tree or tags (inserting or deleting * items, setting or clearing tags) must exclude other modifications, and * exclude any functions reading the tree. * - any function _reading_ the tree or tags (looking up items or tags, * gang lookups) must exclude modifications to the tree, but may occur * concurrently with other readers. * * The notable exceptions to this rule are the following functions: * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup * radix_tree_gang_lookup_tag * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. * * It is still required that the caller manage the synchronization and lifetimes * of the items. So if RCU lock-free lookups are used, typically this would mean * that the items have their own locks, or are amenable to lock-free access; and * that the items are freed by RCU (or only freed after having been deleted from * the radix tree *and* a synchronize_rcu() grace period). * * (Note, rcu_assign_pointer and rcu_dereference are not needed to control * access to data items when inserting into or looking up from the radix tree) * * Note that the value returned by radix_tree_tag_get() may not be relied upon * if only the RCU read lock is held. Functions to set/clear tags and to * delete nodes running concurrently with it may affect its result such that * two consecutive reads in the same locked section may return different * values. If reliability is required, modification functions must also be * excluded from concurrency. * * radix_tree_tagged is able to be called without locking or RCU. */ /** * radix_tree_deref_slot - dereference a slot * @slot: slot pointer, returned by radix_tree_lookup_slot * * For use with radix_tree_lookup_slot(). Caller must hold tree at least read * locked across slot lookup and dereference. Not required if write lock is * held (ie. items cannot be concurrently inserted). * * radix_tree_deref_retry must be used to confirm validity of the pointer if * only the read lock is held. * * Return: entry stored in that slot. */ static inline void *radix_tree_deref_slot(void __rcu **slot) { return rcu_dereference(*slot); } /** * radix_tree_deref_slot_protected - dereference a slot with tree lock held * @slot: slot pointer, returned by radix_tree_lookup_slot * * Similar to radix_tree_deref_slot. The caller does not hold the RCU read * lock but it must hold the tree lock to prevent parallel updates. * * Return: entry stored in that slot. */ static inline void *radix_tree_deref_slot_protected(void __rcu **slot, spinlock_t *treelock) { return rcu_dereference_protected(*slot, lockdep_is_held(treelock)); } /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required * * radix_tree_deref_retry must be used with radix_tree_deref_slot. */ static inline int radix_tree_deref_retry(void *arg) { return unlikely(radix_tree_is_internal_node(arg)); } /** * radix_tree_exception - radix_tree_deref_slot returned either exception? * @arg: value returned by radix_tree_deref_slot * Returns: 0 if well-aligned pointer, non-0 if either kind of exception. */ static inline int radix_tree_exception(void *arg) { return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } int radix_tree_insert(struct radix_tree_root *, unsigned long index, void *); void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp); void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, void __rcu **slot, void *entry); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *, unsigned long index, unsigned int tag); void *radix_tree_tag_clear(struct radix_tree_root *, unsigned long index, unsigned int tag); int radix_tree_tag_get(const struct radix_tree_root *, unsigned long index, unsigned int tag); void radix_tree_iter_tag_clear(struct radix_tree_root *, const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag); unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag); int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag); static inline void radix_tree_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max); enum { RADIX_TREE_ITER_TAG_MASK = 0x0f, /* tag index in lower nybble */ RADIX_TREE_ITER_TAGGED = 0x10, /* lookup tagged slots */ RADIX_TREE_ITER_CONTIG = 0x20, /* stop at first hole */ }; /** * radix_tree_iter_init - initialize radix tree iterator * * @iter: pointer to iterator state * @start: iteration starting index * Returns: NULL */ static __always_inline void __rcu ** radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) { /* * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it * in the case of a successful tagged chunk lookup. If the lookup was * unsuccessful or non-tagged then nobody cares about ->tags. * * Set index to zero to bypass next_index overflow protection. * See the comment in radix_tree_next_chunk() for details. */ iter->index = 0; iter->next_index = start; return NULL; } /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if there no more left * * This function looks up the next chunk in the radix tree starting from * @iter->next_index. It returns a pointer to the chunk's first slot. * Also it fills @iter with data about chunk: position in the tree (index), * its end (next_index), and constructs a bit mask for tagged iterating (tags). */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *, struct radix_tree_iter *iter, unsigned flags); /** * radix_tree_iter_lookup - look up an index in the radix tree * @root: radix tree root * @iter: iterator state * @index: key to look up * * If @index is present in the radix tree, this function returns the slot * containing it and updates @iter to describe the entry. If @index is not * present, it returns NULL. */ static inline void __rcu ** radix_tree_iter_lookup(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned long index) { radix_tree_iter_init(iter, index); return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG); } /** * radix_tree_iter_retry - retry this chunk of the iteration * @iter: iterator state * * If we iterate over a tree protected only by the RCU lock, a race * against deletion or creation may result in seeing a slot for which * radix_tree_deref_retry() returns true. If so, call this function * and continue the iteration. */ static inline __must_check void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter) { iter->next_index = iter->index; iter->tags = 0; return NULL; } static inline unsigned long __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) { return iter->index + slots; } /** * radix_tree_iter_resume - resume iterating when the chunk may be invalid * @slot: pointer to current slot * @iter: iterator state * Returns: New slot pointer * * If the iterator needs to release then reacquire a lock, the chunk may * have been invalidated by an insertion or deletion. Call this function * before releasing the lock to continue the iteration from the next index. */ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter); /** * radix_tree_chunk_size - get current chunk size * * @iter: pointer to radix tree iterator * Returns: current chunk size */ static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; } /** * radix_tree_next_slot - find next slot in chunk * * @slot: pointer to current slot * @iter: pointer to iterator state * @flags: RADIX_TREE_ITER_*, should be constant * Returns: pointer to next slot, or NULL if there no more left * * This function updates @iter->index in the case of a successful lookup. * For tagged lookup it also eats @iter->tags. * * There are several cases where 'slot' can be passed in as NULL to this * function. These cases result from the use of radix_tree_iter_resume() or * radix_tree_iter_retry(). In these cases we don't end up dereferencing * 'slot' because either: * a) we are doing tagged iteration and iter->tags has been set to 0, or * b) we are doing non-tagged iteration, and iter->index and iter->next_index * have been set up so that radix_tree_chunk_size() returns 1 or 0. */ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1); slot++; goto found; } if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); iter->tags >>= offset++; iter->index = __radix_tree_iter_add(iter, offset); slot += offset; goto found; } } else { long count = radix_tree_chunk_size(iter); while (--count > 0) { slot++; iter->index = __radix_tree_iter_add(iter, 1); if (likely(*slot)) goto found; if (flags & RADIX_TREE_ITER_CONTIG) { /* forbid switching to the next chunk */ iter->next_index = 0; break; } } } return NULL; found: return slot; } /** * radix_tree_for_each_slot - iterate over non-empty slots * * @slot: the void** variable for pointer to slot * @root: the struct radix_tree_root pointer * @iter: the struct radix_tree_iter pointer * @start: iteration starting index * * @slot points to radix tree slot, @iter->index contains its index. */ #define radix_tree_for_each_slot(slot, root, iter, start) \ for (slot = radix_tree_iter_init(iter, start) ; \ slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \ slot = radix_tree_next_slot(slot, iter, 0)) /** * radix_tree_for_each_tagged - iterate over tagged slots * * @slot: the void** variable for pointer to slot * @root: the struct radix_tree_root pointer * @iter: the struct radix_tree_iter pointer * @start: iteration starting index * @tag: tag index * * @slot points to radix tree slot, @iter->index contains its index. */ #define radix_tree_for_each_tagged(slot, root, iter, start, tag) \ for (slot = radix_tree_iter_init(iter, start) ; \ slot || (slot = radix_tree_next_chunk(root, iter, \ RADIX_TREE_ITER_TAGGED | tag)) ; \ slot = radix_tree_next_slot(slot, iter, \ RADIX_TREE_ITER_TAGGED | tag)) #endif /* _LINUX_RADIX_TREE_H */ |
499 417 122 499 497 498 496 3 498 145 147 2 145 145 146 146 147 25 120 25 25 23 2 238 237 244 244 21 345 347 46 46 46 147 147 494 2 145 125 146 146 144 147 147 125 23 146 146 144 414 415 21 415 415 495 494 243 241 50 146 108 51 147 162 42 147 147 148 4 41 41 41 41 41 149 149 41 175 152 152 152 152 152 51 149 149 149 8 6 2 27 27 4 24 24 27 179 157 36 179 41 33 32 41 41 41 41 3 41 243 243 23 23 5 5 5 442 443 537 538 442 537 537 162 162 162 6 162 162 162 3 3 36 34 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1997 Linus Torvalds * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <trace/events/writeback.h> #define CREATE_TRACE_POINTS #include <trace/events/timestamp.h> #include "internal.h" /* * Inode locking rules: * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock * inode_hash_lock */ static unsigned int i_hash_mask __ro_after_init; static unsigned int i_hash_shift __ro_after_init; static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. */ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } static inline long get_nr_inodes_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } #ifdef CONFIG_DEBUG_FS static DEFINE_PER_CPU(long, mg_ctime_updates); static DEFINE_PER_CPU(long, mg_fine_stamps); static DEFINE_PER_CPU(long, mg_ctime_swaps); static unsigned long get_mg_ctime_updates(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_updates, i)); return sum; } static unsigned long get_mg_fine_stamps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_fine_stamps, i)); return sum; } static unsigned long get_mg_ctime_swaps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_swaps, i)); return sum; } #define mgtime_counter_inc(__var) this_cpu_inc(__var) static int mgts_show(struct seq_file *s, void *p) { unsigned long ctime_updates = get_mg_ctime_updates(); unsigned long ctime_swaps = get_mg_ctime_swaps(); unsigned long fine_stamps = get_mg_fine_stamps(); unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); seq_printf(s, "%lu %lu %lu %lu\n", ctime_updates, ctime_swaps, fine_stamps, floor_swaps); return 0; } DEFINE_SHOW_ATTRIBUTE(mgts); static int __init mg_debugfs_init(void) { debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); return 0; } late_initcall(mg_debugfs_init); #else /* ! CONFIG_DEBUG_FS */ #define mgtime_counter_inc(__var) do { } while (0) #endif /* CONFIG_DEBUG_FS */ /* * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL /* * Statistics gathering.. */ static struct inodes_stat_t inodes_stat; static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, }; static int __init init_fs_inode_sysctls(void) { register_sysctl_init("fs", inodes_sysctls); return 0; } early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) { return -ENXIO; } /** * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. * If there are additional allocations required @gfp is used. */ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; inode->i_state = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; if (sb->s_type->fs_flags & FS_MGTIME) inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; #ifdef CONFIG_CGROUP_WRITEBACK inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; #endif spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); if (sb->s_iflags & SB_I_STABLE_WRITES) mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { kmem_cache_free(inode_cachep, inode); } EXPORT_SYMBOL(free_inode_nonrcu); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); if (inode->free_inode) inode->free_inode(inode); else free_inode_nonrcu(inode); } /** * alloc_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. * Inode wont be chained in superblock s_inodes list * This means : * - fs can't be unmount * - quotas, fsnotify, writeback can't work */ struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; if (ops->alloc_inode) inode = ops->alloc_inode(sb); else inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return NULL; } inode->free_inode = ops->free_inode; i_callback(&inode->i_rcu); return NULL; } return inode; } void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); } #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && !is_uncached_acl(inode->i_acl)) posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return; } inode->free_inode = ops->free_inode; call_rcu(&inode->i_rcu, i_callback); } /** * drop_nlink - directly drop an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. In cases * where we are attempting to track writes to the * filesystem, a decrement to zero means an imminent * write when the file is truncated and actually unlinked * on the filesystem. */ void drop_nlink(struct inode *inode) { WARN_ON(inode->i_nlink == 0); inode->__i_nlink--; if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } EXPORT_SYMBOL(drop_nlink); /** * clear_nlink - directly zero an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. See * drop_nlink() for why we care about i_nlink hitting zero. */ void clear_nlink(struct inode *inode) { if (inode->i_nlink) { inode->__i_nlink = 0; atomic_long_inc(&inode->i_sb->s_remove_count); } } EXPORT_SYMBOL(clear_nlink); /** * set_nlink - directly set an inode's link count * @inode: inode * @nlink: new nlink (should be non-zero) * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. */ void set_nlink(struct inode *inode, unsigned int nlink) { if (!nlink) { clear_nlink(inode); } else { /* Yes, some filesystems do change nlink from zero to one */ if (inode->i_nlink == 0) atomic_long_dec(&inode->i_sb->s_remove_count); inode->__i_nlink = nlink; } } EXPORT_SYMBOL(set_nlink); /** * inc_nlink - directly increment an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. Currently, * it is only here for parity with dec_nlink(). */ void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } inode->__i_nlink++; } EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); __address_space_init_once(mapping); } EXPORT_SYMBOL(address_space_init_once); /* * These are initializations that only need to be done * once, because the fields are idempotent across use * of the inode, so let the slab aware of that. */ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); static void init_once(void *foo) { struct inode *inode = (struct inode *) foo; inode_init_once(inode); } /* * get additional reference to inode; caller must already hold one. */ void ihold(struct inode *inode) { WARN_ON(atomic_inc_return(&inode->i_count) < 2); } EXPORT_SYMBOL(ihold); static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (atomic_read(&inode->i_count)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; if (!mapping_shrinkable(&inode->i_data)) return; if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; } struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { void *bit_address; bit_address = inode_state_wait_address(inode, bit); init_wait_var_entry(wqe, bit_address, 0); return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). * * Needs inode->i_lock held. */ void inode_add_lru(struct inode *inode) { __inode_add_lru(inode, false); } static void inode_lru_list_del(struct inode *inode) { if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); inode->i_state |= I_LRU_ISOLATING; } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); inode->i_state &= ~I_LRU_ISOLATING; /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); if (!(inode->i_state & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. */ if (!(inode->i_state & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); WARN_ON(inode->i_state & I_LRU_ISOLATING); } /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add */ void inode_sb_list_add(struct inode *inode) { struct super_block *sb = inode->i_sb; spin_lock(&sb->s_inode_list_lock); list_add(&inode->i_sb_list, &sb->s_inodes); spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { struct super_block *sb = inode->i_sb; if (!list_empty(&inode->i_sb_list)) { spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); spin_unlock(&sb->s_inode_list_lock); } } static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES; tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); return tmp & i_hash_mask; } /** * __insert_inode_hash - hash an inode * @inode: unhashed inode * @hashval: unsigned long value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__remove_inode_hash); void dump_mapping(const struct address_space *mapping) { struct inode *host; const struct address_space_operations *a_ops; struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; unsigned long ino; /* * If mapping is an invalid pointer, we don't want to crash * accessing it, so probe everything depending on it carefully. */ if (get_kernel_nofault(host, &mapping->host) || get_kernel_nofault(a_ops, &mapping->a_ops)) { pr_warn("invalid mapping:%px\n", mapping); return; } if (!host) { pr_warn("aops:%ps\n", a_ops); return; } if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); return; } if (!dentry_first) { pr_warn("aops:%ps ino:%lx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) strscpy(fname, "<invalid>"); /* * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", a_ops, ino, fname); } void clear_inode(struct inode *inode) { /* * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); /* * Almost always, mapping_empty(&inode->i_data) here; but there are * two known and long-standing ways in which nodes may get left behind * (when deep radix-tree node allocation failed partway; or when THP * collapse_file() failed). Until those two known cases are cleaned up, * or a cleanup function is called here, do not BUG_ON(!mapping_empty), * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected * to. We remove any pages still attached to the inode and wait for any IO that * is still in progress before finally destroying the inode. * * An inode must already be marked I_FREEING so that we avoid the inode being * moved back onto lists if we race with other code that manipulates the lists * (e.g. writeback_single_inode). The caller is responsible for setting this. * * An inode must already be removed from the LRU list before being evicted from * the cache. This should occur atomically with setting the I_FREEING state * flag, so no inodes here should ever be on the LRU when being evicted. */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); if (!list_empty(&inode->i_io_list)) inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since * the inode has I_FREEING set, flusher thread won't start new work on * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); } else { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); remove_inode_hash(inode); /* * Wake up waiters in __wait_on_freeing_inode(). * * It is an invariant that any thread we need to wake up is already * accounted for before remove_inode_hash() acquires ->i_lock -- both * sides take the lock and sleep is aborted if the inode is found * unhashed. Thus either the sleeper wins and goes off CPU, or removal * wins and the sleeper aborts after testing with the lock. * * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); destroy_inode(inode); } /* * dispose_list - dispose of the contents of a local list * @head: the head of the list to free * * Dispose-list gets a local list with local inodes in it, so it doesn't * need to worry about list corruption and SMP locks. */ static void dispose_list(struct list_head *head) { while (!list_empty(head)) { struct inode *inode; inode = list_first_entry(head, struct inode, i_lru); list_del_init(&inode->i_lru); evict(inode); cond_resched(); } } /** * evict_inodes - evict all evictable inodes for a superblock * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ void evict_inodes(struct super_block *sb) { struct inode *inode, *next; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; spin_lock(&inode->i_lock); if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); /* * We can have a ton of inodes to evict at unmount time given * enough memory, check to see if we need to go to sleep for a * bit so we don't livelock. */ if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } EXPORT_SYMBOL_GPL(evict_inodes); /* * Isolate the inode from the LRU in preparation for freeing it. * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another * pass through the LRU before it gets reclaimed. This is necessary because of * the fact we are doing lazy LRU updates to minimise lock contention so the * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ static enum lru_status inode_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); /* * We are inverting the lru lock/inode->i_lock here, so use a * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* * Inodes can get referenced, redirtied, or repopulated while * they're already on the LRU, and this can make them * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } /* * On highmem systems, mapping_shrinkable() permits dropping * page cache in order to free up struct inodes: lowmem might * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(&lru->lock); if (remove_inode_buffers(inode)) { unsigned long reap; reap = invalidate_mapping_pages(&inode->i_data, 0, -1); if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); return LRU_RETRY; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* * Walk the superblock inode LRU for freeable inodes and attempt to free them. * This is called from the superblock shrinker function with a number of inodes * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * find_inode_fast is the fast path version of find_inode, see the comment at * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * Each cpu owns a range of LAST_INO_BATCH numbers. * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, * to renew the exhausted range. * * This does not significantly increase overflow rate because every CPU can * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the * 2^32 range, and is a worst-case. Even a 50% wastage would only increase * overflow rate by 2x, which does not seem too significant. * * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ #define LAST_INO_BATCH 1024 static DEFINE_PER_CPU(unsigned int, last_ino); unsigned int get_next_ino(void) { unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); res = next - LAST_INO_BATCH; } #endif res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) res++; *p = res; put_cpu_var(last_ino); return res; } EXPORT_SYMBOL(get_next_ino); /** * new_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. * If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the * newly created inode's mapping * */ struct inode *new_inode(struct super_block *sb) { struct inode *inode; inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(new_inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } } EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif /** * unlock_new_inode - clear the I_NEW state and wake up any waiters * @inode: new inode to unlock * * Called when the inode is fully initialised to clear the new state of the * inode and wake up anyone waiting for the inode to finish initialisation. */ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } EXPORT_SYMBOL(discard_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock * @inode2: second inode to lock */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); if (inode1 > inode2) swap(inode1, inode2); if (inode1) inode_lock(inode1); if (inode2 && inode2 != inode1) inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); /** * unlock_two_nondirectories - release locks from lock_two_nondirectories() * @inode1: first inode to unlock * @inode2: second inode to unlock */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) { WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); } if (inode2 && inode2 != inode1) { WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); } } EXPORT_SYMBOL(unlock_two_nondirectories); /** * inode_insert5 - obtain an inode from a mounted file system * @inode: pre-allocated inode to use for insert to cache * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * variant of iget5_locked() that doesn't allocate an inode. * * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. */ spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; wait_on_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; } return old; } if (set && unlikely(set(inode, data))) { spin_unlock(&inode_hash_lock); return NULL; } /* * Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(inode_insert5); /** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * * If the inode is not present in the cache, allocate and insert a new inode * and return it locked, hashed, and with the I_NEW flag set. The file system * gets to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode *inode = ilookup5(sb, hashval, test, data); if (!inode) { struct inode *new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } } return inode; } EXPORT_SYMBOL(iget5_locked); /** * iget5_locked_rcu - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. */ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; again: inode = find_inode(sb, head, test, data, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } return inode; } EXPORT_SYMBOL_GPL(iget5_locked_rcu); /** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get * * Search for the inode specified by @ino in the inode cache and if present * return it with an increased reference count. This is for file systems * where the inode number is sufficient for unique identification of an inode. * * If the inode is not in cache, allocate a new inode and return it locked, * hashed, and with the I_NEW flag set. The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } inode = alloc_inode(sb); if (inode) { struct inode *old; spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino, true); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ return inode; } /* * Uhhuh, somebody else created the same inode under * us. Use the old inode instead of the one we just * allocated. */ spin_unlock(&inode_hash_lock); destroy_inode(inode); if (IS_ERR(old)) return NULL; inode = old; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. * If we find one, then the inode number we are trying to * allocate is not unique and so we should not use it. * * Returns 1 if the inode number is unique, 0 if it is not. */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; hlist_for_each_entry_rcu(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } return 1; } /** * iunique - get a unique inode number * @sb: superblock * @max_reserved: highest reserved inode number * * Obtain an inode number that is unique on the system for a given * superblock. This is used by file systems that have no natural * permanent inode numbering system. An inode number is returned that * is higher than the reserved limit but unique. * * BUGS: * With a large number of inodes live on the file system this function * currently becomes quite slow. */ ino_t iunique(struct super_block *sb, ino_t max_reserved) { /* * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; ino_t res; rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; } return inode; } EXPORT_SYMBOL(igrab); /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented * reference count. * * Note: I_NEW is not waited upon so you have to be very careful what you do * with the returned inode. You probably should be using ilookup5() instead. * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data, true); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; } EXPORT_SYMBOL(ilookup5_nowait); /** * ilookup5 - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache, * and if the inode is in the cache, return the inode with an incremented * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * * This is a generalized version of ilookup() for file systems where the * inode number is not sufficient for unique identification of an inode. * * Note: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup5); /** * ilookup - search for an inode in the inode cache * @sb: super block of file system to search * @ino: inode number to search for * * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup); /** * find_inode_nowait - find an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @match: callback used for comparisons between inodes * @data: opaque data pointer to pass to @match * * Search for the inode specified by @hashval and @data in the inode * cache, where the helper function @match will return 0 if the inode * does not match, 1 if the inode does match, and -1 if the search * should be stopped. The @match function must be responsible for * taking the i_lock spin_lock and checking i_state for an inode being * freed or being initialized, and incrementing the reference count * before returning 1. It also must not sleep, since it is called with * the inode_hash_lock spinlock held. * * This is a even more generalized version of ilookup5() when the * function must never block --- find_inode() can block in * __wait_on_freeing_inode() --- or when the caller can not increment * the reference count because the resulting iput() might cause an * inode eviction. The tradeoff is that the @match funtion must be * very carefully implemented. */ struct inode *find_inode_nowait(struct super_block *sb, unsigned long hashval, int (*match)(struct inode *, unsigned long, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *ret_inode = NULL; int mval; spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, head, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); if (mval == 0) continue; if (mval == 1) ret_inode = inode; goto out; } out: spin_unlock(&inode_hash_lock); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); /** * find_inode_rcu - find an inode in the inode cache * @sb: Super block of file system to search * @hashval: Key to hash * @test: Function to test match on an inode * @data: Data for test function * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_rcu); /** * find_inode_by_ino_rcu - Find an inode in the inode cache * @sb: Super block of file system to search * @ino: The inode number to match * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); if (old->i_state & (I_FREEING|I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } if (unlikely(old->i_state & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } iput(old); } } EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { iput(old); return -EBUSY; } return 0; } EXPORT_SYMBOL(insert_inode_locked4); int generic_delete_inode(struct inode *inode) { return 1; } EXPORT_SYMBOL(generic_delete_inode); /* * Called when we're dropping the last reference * to an inode. * * Call the FS "drop_inode()" function, defaulting to * the legacy UNIX filesystem behaviour. If it tells * us to evict inode, do so. Otherwise, retain inode * in cache if fs is alive, sync and evict if fs is * shutting down. */ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; unsigned long state; int drop; WARN_ON(inode->i_state & I_NEW); if (op->drop_inode) drop = op->drop_inode(inode); else drop = generic_drop_inode(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } state = inode->i_state; if (!drop) { WRITE_ONCE(inode->i_state, state | I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); state = inode->i_state; WARN_ON(state & I_NEW); state &= ~I_WILL_FREE; } WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); } /** * iput - put an inode * @inode: inode to put * * Puts an inode, dropping its usage count. If the inode use count hits * zero, the inode is then freed and may also be destroyed. * * Consequently, iput() can sleep. */ void iput(struct inode *inode) { if (!inode) return; BUG_ON(inode->i_state & I_CLEAR); retry: if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } iput_final(inode); } } EXPORT_SYMBOL(iput); #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file * @inode: inode owning the block number being requested * @block: pointer containing the block to find * * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { if (!inode->i_mapping->a_ops->bmap) return -EINVAL; *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); return 0; } EXPORT_SYMBOL(bmap); #endif /* * With relative atime, only update atime if the previous atime is * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return true; /* * Good, we can skip the atime update: */ return false; } /** * inode_update_timestamps - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. This function handles updating the * actual timestamps. It's up to the caller to ensure that the inode is marked * dirty appropriately. * * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, * attempt to update all three of them. S_ATIME updates can be handled * independently of the rest. * * Returns a set of S_* flags indicating which values changed. */ int inode_update_timestamps(struct inode *inode, int flags) { int updated = 0; struct timespec64 now; if (flags & (S_MTIME|S_CTIME|S_VERSION)) { struct timespec64 ctime = inode_get_ctime(inode); struct timespec64 mtime = inode_get_mtime(inode); now = inode_set_ctime_current(inode); if (!timespec64_equal(&now, &ctime)) updated |= S_CTIME; if (!timespec64_equal(&now, &mtime)) { inode_set_mtime_to_ts(inode, now); updated |= S_MTIME; } if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) updated |= S_VERSION; } else { now = current_time(inode); } if (flags & S_ATIME) { struct timespec64 atime = inode_get_atime(inode); if (!timespec64_equal(&now, &atime)) { inode_set_atime_to_ts(inode, now); updated |= S_ATIME; } } return updated; } EXPORT_SYMBOL(inode_update_timestamps); /** * generic_update_time - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME * updates can be handled done independently of the rest. * * Returns a S_* mask indicating which fields were updated. */ int generic_update_time(struct inode *inode, int flags) { int updated = inode_update_timestamps(inode, flags); int dirty_flags = 0; if (updated & (S_ATIME|S_MTIME|S_CTIME)) dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; if (updated & S_VERSION) dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); return updated; } EXPORT_SYMBOL(generic_update_time); /* * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) return inode->i_op->update_time(inode, flags); generic_update_time(inode, flags); return 0; } EXPORT_SYMBOL(inode_update_time); /** * atime_needs_update - update the access time * @path: the &struct path to update * @inode: inode to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; now = current_time(inode); if (!relatime_need_update(mnt, inode, now)) return false; atime = inode_get_atime(inode); if (timespec64_equal(&atime, &now)) return false; return true; } void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for * Btrfs), but since we touch atime while walking down the path we * really don't care if we failed to update the atime of the file, * so just ignore the return value. * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; int ret; if (IS_NOSEC(inode)) return 0; mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; if (ret) mask |= ATTR_KILL_PRIV; return mask; } static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; /* * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ return notify_change(idmap, dentry, &newattrs, NULL); } int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; if (kill) { if (flags & IOCB_NOWAIT) return -EAGAIN; error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) inode_has_no_xattr(inode); return error; } EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) * @file: file to remove privileges from * * When file is modified by a write or truncation ensure that special * file privileges are removed. * * Return: 0 on success, negative errno on failure. */ int file_remove_privs(struct file *file) { return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); /** * current_time - Return FS time (possibly fine-grained) * @inode: inode. * * Return the current time truncated to the time granularity supported by * the fs, as suitable for a ctime/mtime change. If the ctime is flagged * as having been QUERIED, get a fine-grained timestamp, but don't update * the floor. * * For a multigrain inode, this is effectively an estimate of the timestamp * that a file would receive. An actual update must go through * inode_set_ctime_current(). */ struct timespec64 current_time(struct inode *inode) { struct timespec64 now; u32 cns; ktime_get_coarse_real_ts64_mg(&now); if (!is_mgtime(inode)) goto out; /* If nothing has queried it, then coarse time is fine */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { /* * If there is no apparent change, then get a fine-grained * timestamp. */ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) ktime_get_real_ts64(&now); } out: return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); static int inode_needs_update_time(struct inode *inode) { struct timespec64 now, ts; int sync_it = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; now = current_time(inode); ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; return sync_it; } static int __file_update_time(struct file *file, int sync_mode) { int ret = 0; struct inode *inode = file_inode(file); /* try to update time settings */ if (!mnt_get_write_access_file(file)) { ret = inode_update_time(inode, sync_mode); mnt_put_write_access_file(file); } return ret; } /** * file_update_time - update mtime and ctime time * @file: file accessed * * Update the mtime and ctime members of an inode and mark the inode for * writeback. Note that this function is meant exclusively for usage in * the file write path of filesystems, and filesystems may choose to * explicitly ignore updates via this function with the _NOCMTIME inode * flag, e.g. for network filesystem where these imestamps are handled * by the server. This can return an error for file systems who need to * allocate space in order to update an inode. * * Return: 0 on success, negative errno on failure. */ int file_update_time(struct file *file) { int ret; struct inode *inode = file_inode(file); ret = inode_needs_update_time(inode); if (ret <= 0) return ret; return __file_update_time(file, ret); } EXPORT_SYMBOL(file_update_time); /** * file_modified_flags - handle mandated vfs changes when modifying a file * @file: file that was modified * @flags: kiocb flags * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * If IOCB_NOWAIT is set, special file privileges will not be removed and * time settings will not be updated. It will return -EAGAIN. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ ret = file_remove_privs_flags(file, flags); if (ret) return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; ret = inode_needs_update_time(inode); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; return __file_update_time(file, ret); } /** * file_modified - handle mandated vfs changes when modifying a file * @file: file that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int file_modified(struct file *file) { return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); /** * kiocb_modified - handle mandated vfs changes when modifying a file * @iocb: iocb that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int kiocb_modified(struct kiocb *iocb) { return file_modified_flags(iocb->ki_filp, iocb->ki_flags); } EXPORT_SYMBOL_GPL(kiocb_modified); int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) return 1; if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) return 1; return 0; } EXPORT_SYMBOL(inode_needs_sync); /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its * deletion before reporting that it isn't found. This function waits * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { WARN_ON(is_inode_hash_locked); spin_unlock(&inode->i_lock); return; } wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); } static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { if (!str) return 0; ihash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("ihash_entries=", set_ihash_entries); /* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void __init inode_init(void) { /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ if (!hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; if (S_ISCHR(mode)) { inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; } else if (S_ISBLK(mode)) { if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) ; /* leave it no_open_fops */ else printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); } EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * * If the inode has been created through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. On non-idmapped mounts or if permission * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; } else inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ bool inode_dio_finished(const struct inode *inode) { return atomic_read(&inode->i_dio_count) == 0; } EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish * @inode: inode to wait for * * Waits for all pending direct I/O requests to finish so that we can * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_mutex. */ void inode_dio_wait(struct inode *inode) { wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); void inode_dio_wait_interruptible(struct inode *inode) { wait_var_event_interruptible(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_mutex, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! */ void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); void inode_nohighmem(struct inode *inode) { mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; return ts; } EXPORT_SYMBOL(inode_set_ctime_to_ts); /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated * * Truncate a timespec to the granularity supported by the fs * containing the inode. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) t.tv_nsec = 0; /* Avoid division in the common cases 1 ns and 1 s. */ if (gran == 1) ; /* nothing */ else if (gran == NSEC_PER_SEC) t.tv_nsec = 0; else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran; else WARN(1, "invalid file time granularity: %u", gran); return t; } EXPORT_SYMBOL(timestamp_truncate); /** * inode_set_ctime_current - set the ctime to current_time * @inode: inode * * Set the inode's ctime to the current value for the inode. Returns the * current value that was assigned. If this is not a multigrain inode, then we * set it to the later of the coarse time and floor value. * * If it is multigrain, then we first see if the coarse-grained timestamp is * distinct from what is already there. If so, then use that. Otherwise, get a * fine-grained timestamp. * * After that, try to swap the new value into i_ctime_nsec. Accept the * resulting ctime, regardless of the outcome of the swap. If it has * already been replaced, then that timestamp is later than the earlier * unacceptable one, and is thus acceptable. */ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; u32 cns, cur; ktime_get_coarse_real_ts64_mg(&now); now = timestamp_truncate(now, inode); /* Just return that if this is not a multigrain fs */ if (!is_mgtime(inode)) { inode_set_ctime_to_ts(inode, now); goto out; } /* * A fine-grained time is only needed if someone has queried * for timestamps, and the current coarse grained time isn't * later than what's already there. */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, .tv_nsec = cns & ~I_CTIME_QUERIED }; if (timespec64_compare(&now, &ctime) <= 0) { ktime_get_real_ts64_mg(&now); now = timestamp_truncate(now, inode); mgtime_counter_inc(mg_fine_stamps); } } mgtime_counter_inc(mg_ctime_updates); /* No need to cmpxchg if it's exactly the same */ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { trace_ctime_xchg_skip(inode, &now); goto out; } cur = cns; retry: /* Try to swap the nsec value into place. */ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now.tv_sec; trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); mgtime_counter_inc(mg_ctime_swaps); } else { /* * Was the change due to someone marking the old ctime QUERIED? * If so then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { cns = cur; goto retry; } /* Otherwise, keep the existing ctime */ now.tv_sec = inode->i_ctime_sec; now.tv_nsec = cur & ~I_CTIME_QUERIED; } out: return now; } EXPORT_SYMBOL(inode_set_ctime_current); /** * inode_set_ctime_deleg - try to update the ctime on a delegated inode * @inode: inode to update * @update: timespec64 to set the ctime * * Attempt to atomically update the ctime on behalf of a delegation holder. * * The nfs server can call back the holder of a delegation to get updated * inode attributes, including the mtime. When updating the mtime, update * the ctime to a value at least equal to that. * * This can race with concurrent updates to the inode, in which * case the update is skipped. * * Note that this works even when multigrain timestamps are not enabled, * so it is used in either case. */ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { struct timespec64 now, cur_ts; u32 cur, old; /* pairs with try_cmpxchg below */ cur = smp_load_acquire(&inode->i_ctime_nsec); cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; cur_ts.tv_sec = inode->i_ctime_sec; /* If the update is older than the existing value, skip it. */ if (timespec64_compare(&update, &cur_ts) <= 0) return cur_ts; ktime_get_coarse_real_ts64_mg(&now); /* Clamp the update to "now" if it's in the future */ if (timespec64_compare(&update, &now) > 0) update = now; update = timestamp_truncate(update, inode); /* No need to update if the values are already the same */ if (timespec64_equal(&update, &cur_ts)) return cur_ts; /* * Try to swap the nsec value into place. If it fails, that means * it raced with an update due to a write or similar activity. That * stamp takes precedence, so just skip the update. */ retry: old = cur; if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { inode->i_ctime_sec = update.tv_sec; mgtime_counter_inc(mg_ctime_swaps); return update; } /* * Was the change due to another task marking the old ctime QUERIED? * * If so, then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) goto retry; /* Otherwise, it was a new timestamp. */ cur_ts.tv_sec = inode->i_ctime_sec; cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; return cur_ts; } EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * * Return: true if the caller is sufficiently privileged, false if not. */ bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * * If the @mode of the new file has both the S_ISGID and S_IXGRP bit * raised and @dir has the S_ISGID bit raised ensure that the caller is * either in the group of the parent directory or they have CAP_FSETID * in their user namespace and are privileged over the parent directory. * In all other cases, strip the S_ISGID bit from @mode. * * Return: the new mode to use for the file */ umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return mode; return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); #ifdef CONFIG_DEBUG_VFS /* * Dump an inode. * * TODO: add a proper inode dumping routine, this is a stub to get debug off the * ground. */ void dump_inode(struct inode *inode, const char *reason) { pr_warn("%s encountered for inode %px", reason, inode); } EXPORT_SYMBOL(dump_inode); #endif |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */ |
63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VGIC_H #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm TRACE_EVENT(vgic_update_irq_pending, TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), TP_ARGS(vcpu_id, irq, level), TP_STRUCT__entry( __field( unsigned long, vcpu_id ) __field( __u32, irq ) __field( bool, level ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->irq = irq; __entry->level = level; ), TP_printk("VCPU: %ld, IRQ %d, level: %d", __entry->vcpu_id, __entry->irq, __entry->level) ); #endif /* _TRACE_VGIC_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace /* This part must be outside protection */ #include <trace/define_trace.h> |
546 550 316 317 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, affinity); raw_spin_unlock_irqrestore(&p->pi_lock, flags); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif |
280 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ARM64_KVM_NESTED_H #define __ARM64_KVM_NESTED_H #include <linux/bitfield.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_pgtable.h> static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) { return (!__is_defined(__KVM_NVHE_HYPERVISOR__) && cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2)); } /* Translation helpers from non-VHE EL2 to EL1 */ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) { return (u64)FIELD_GET(TCR_EL2_PS_MASK, tcr_el2) << TCR_IPS_SHIFT; } static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) { return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | tcr_el2_ps_to_tcr_el1_ips(tcr) | (tcr & TCR_EL2_TG0_MASK) | (tcr & TCR_EL2_ORGN0_MASK) | (tcr & TCR_EL2_IRGN0_MASK) | (tcr & TCR_EL2_T0SZ_MASK); } static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2) { u64 cpacr_el1 = CPACR_EL1_RES1; if (cptr_el2 & CPTR_EL2_TTA) cpacr_el1 |= CPACR_EL1_TTA; if (!(cptr_el2 & CPTR_EL2_TFP)) cpacr_el1 |= CPACR_EL1_FPEN; if (!(cptr_el2 & CPTR_EL2_TZ)) cpacr_el1 |= CPACR_EL1_ZEN; cpacr_el1 |= cptr_el2 & (CPTR_EL2_TCPAC | CPTR_EL2_TAM); return cpacr_el1; } static inline u64 translate_sctlr_el2_to_sctlr_el1(u64 val) { /* Only preserve the minimal set of bits we support */ val &= (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | SCTLR_ELx_WXN | SCTLR_ELx_EE); val |= SCTLR_EL1_RES1; return val; } static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) { /* Clear the ASID field */ return ttbr0 & ~GENMASK_ULL(63, 48); } extern bool forward_smc_trap(struct kvm_vcpu *vcpu); extern bool forward_debug_exception(struct kvm_vcpu *vcpu); extern void kvm_init_nested(struct kvm *kvm); extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu); union tlbi_info; extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, const union tlbi_info *info, void (*)(struct kvm_s2_mmu *, const union tlbi_info *)); extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); struct kvm_s2_trans { phys_addr_t output; unsigned long block_size; bool writable; bool readable; int level; u32 esr; u64 desc; }; static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) { return trans->output; } static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans) { return trans->block_size; } static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans) { return trans->esr; } static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans) { return trans->readable; } static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) { return trans->writable; } static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) { return !(trans->desc & BIT(54)); } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, struct kvm_s2_trans *result); extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans); extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); extern void kvm_nested_s2_wp(struct kvm *kvm); extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); extern void kvm_nested_s2_flush(struct kvm *kvm); unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val); static inline bool kvm_supported_tlbi_s1e1_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (!(sys_reg_Op0(instr) == TLBI_Op0 && sys_reg_Op1(instr) == TLBI_Op1_EL1)) return false; if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || (sys_reg_CRn(instr) == TLBI_CRn_nXS && kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || CRm == TLBI_CRm_RNS) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (!(sys_reg_Op0(instr) == TLBI_Op0 && sys_reg_Op1(instr) == TLBI_Op1_EL2)) return false; if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || (sys_reg_CRn(instr) == TLBI_CRn_nXS && kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) return false; if (CRm == TLBI_CRm_IPAIS || CRm == TLBI_CRm_IPAONS) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || CRm == TLBI_CRm_RNS) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu); u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val); #ifdef CONFIG_ARM64_PTR_AUTH bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr); #else static inline bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) { /* We really should never execute this... */ WARN_ON_ONCE(1); *elr = 0xbad9acc0debadbad; return false; } #endif #define KVM_NV_GUEST_MAP_SZ (KVM_PGTABLE_PROT_SW1 | KVM_PGTABLE_PROT_SW0) static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) { return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); } /* Adjust alignment for the contiguous bit as per StageOA() */ #define contiguous_bit_shift(d, wi, l) \ ({ \ u8 shift = 0; \ \ if ((d) & PTE_CONT) { \ switch (BIT((wi)->pgshift)) { \ case SZ_4K: \ shift = 4; \ break; \ case SZ_16K: \ shift = (l) == 2 ? 5 : 7; \ break; \ case SZ_64K: \ shift = 5; \ break; \ } \ } \ \ shift; \ }) static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid) { u64 base, tg, num, scale; int shift; tg = FIELD_GET(GENMASK(47, 46), val); switch(tg) { case 1: shift = 12; break; case 2: shift = 14; break; case 3: default: /* IMPDEF: handle tg==0 as 64k */ shift = 16; break; } base = (val & GENMASK(36, 0)) << shift; if (asid) *asid = FIELD_GET(TLBIR_ASID_MASK, val); scale = FIELD_GET(GENMASK(45, 44), val); num = FIELD_GET(GENMASK(43, 39), val); *range = __TLBI_RANGE_PAGES(num, scale) << shift; return base; } static inline unsigned int ps_to_output_size(unsigned int ps) { switch (ps) { case 0: return 32; case 1: return 36; case 2: return 40; case 3: return 42; case 4: return 44; case 5: default: return 48; } } enum trans_regime { TR_EL10, TR_EL20, TR_EL2, }; struct s1_walk_info { u64 baddr; enum trans_regime regime; unsigned int max_oa_bits; unsigned int pgshift; unsigned int txsz; int sl; bool as_el0; bool hpd; bool e0poe; bool poe; bool pan; bool be; bool s2; }; struct s1_walk_result { union { struct { u64 desc; u64 pa; s8 level; u8 APTable; bool nG; u16 asid; bool UXNTable; bool PXNTable; bool uwxn; bool uov; bool ur; bool uw; bool ux; bool pwxn; bool pov; bool pr; bool pw; bool px; }; struct { u8 fst; bool ptw; bool s2; }; }; bool failed; }; int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va); /* VNCR management */ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); #define vncr_fixmap(c) \ ({ \ u32 __c = (c); \ BUG_ON(__c >= NR_CPUS); \ (FIX_VNCR - __c); \ }) #endif /* __ARM64_KVM_NESTED_H */ |
647 649 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | // SPDX-License-Identifier: GPL-2.0-only /* * A generic implementation of binary search for the Linux kernel * * Copyright (C) 2008-2009 Ksplice, Inc. * Author: Tim Abbott <tabbott@ksplice.com> */ #include <linux/export.h> #include <linux/bsearch.h> #include <linux/kprobes.h> /* * bsearch - binary search an array of elements * @key: pointer to item being searched for * @base: pointer to first element to search * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * * This function does a binary search on the given array. The * contents of the array should already be in ascending sorted order * under the provided comparison function. * * Note that the key need not have the same type as the elements in * the array, e.g. key could be a string and the comparison function * could compare the string with the struct's name field. However, if * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/traps.h * * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_TRAP_H #define __ASM_TRAP_H #include <linux/list.h> #include <asm/esr.h> #include <asm/ptrace.h> #include <asm/sections.h> #ifdef CONFIG_ARMV8_DEPRECATED bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn); #else static inline bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) { return false; } #endif /* CONFIG_ARMV8_DEPRECATED */ void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); int early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs); /* * Move regs->pc to next instruction and do necessary setup before it * is executed. */ void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size); static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } static inline int in_entry_text(unsigned long ptr) { return ptr >= (unsigned long)&__entry_text_start && ptr < (unsigned long)&__entry_text_end; } /* * CPUs with the RAS extensions have an Implementation-Defined-Syndrome bit * to indicate whether this ESR has a RAS encoding. CPUs without this feature * have a ISS-Valid bit in the same position. * If this bit is set, we know its not a RAS SError. * If its clear, we need to know if the CPU supports RAS. Uncategorized RAS * errors share the same encoding as an all-zeros encoding from a CPU that * doesn't support RAS. */ static inline bool arm64_is_ras_serror(unsigned long esr) { WARN_ON(preemptible()); if (esr & ESR_ELx_IDS) return false; if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) return true; else return false; } /* * Return the AET bits from a RAS SError's ESR. * * It is implementation defined whether Uncategorized errors are containable. * We treat them as Uncontainable. * Non-RAS SError's are reported as Uncontained/Uncategorized. */ static inline unsigned long arm64_ras_serror_get_severity(unsigned long esr) { unsigned long aet = esr & ESR_ELx_AET; if (!arm64_is_ras_serror(esr)) { /* Not a RAS error, we can't interpret the ESR. */ return ESR_ELx_AET_UC; } /* * AET is RES0 if 'the value returned in the DFSC field is not * [ESR_ELx_FSC_SERROR]' */ if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_SERROR) { /* No severity information : Uncategorized */ return ESR_ELx_AET_UC; } return aet; } bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr); void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr); static inline void arm64_mops_reset_regs(struct user_pt_regs *regs, unsigned long esr) { bool wrong_option = esr & ESR_ELx_MOPS_ISS_WRONG_OPTION; bool option_a = esr & ESR_ELx_MOPS_ISS_OPTION_A; int dstreg = ESR_ELx_MOPS_ISS_DESTREG(esr); int srcreg = ESR_ELx_MOPS_ISS_SRCREG(esr); int sizereg = ESR_ELx_MOPS_ISS_SIZEREG(esr); unsigned long dst, size; dst = regs->regs[dstreg]; size = regs->regs[sizereg]; /* * Put the registers back in the original format suitable for a * prologue instruction, using the generic return routine from the * Arm ARM (DDI 0487I.a) rules CNTMJ and MWFQH. */ if (esr & ESR_ELx_MOPS_ISS_MEM_INST) { /* SET* instruction */ if (option_a ^ wrong_option) { /* Format is from Option A; forward set */ regs->regs[dstreg] = dst + size; regs->regs[sizereg] = -size; } } else { /* CPY* instruction */ unsigned long src = regs->regs[srcreg]; if (!(option_a ^ wrong_option)) { /* Format is from Option B */ if (regs->pstate & PSR_N_BIT) { /* Backward copy */ regs->regs[dstreg] = dst - size; regs->regs[srcreg] = src - size; } } else { /* Format is from Option A */ if (size & BIT(63)) { /* Forward copy */ regs->regs[dstreg] = dst + size; regs->regs[srcreg] = src + size; regs->regs[sizereg] = -size; } } } if (esr & ESR_ELx_MOPS_ISS_FROM_EPILOGUE) regs->pc -= 8; else regs->pc -= 4; } #endif |
1 1 137 137 163 165 14 106 1 106 165 102 76 33 33 76 4 4 1 2 1 1 1 1 1 1 3 1 2 1 1 2 2 2 1 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 72 72 72 72 68 21 72 72 72 72 72 72 72 58 15 15 72 72 72 72 72 72 72 6 6 6 1 1 2 4 4 4 1 1 1 1 4 4 1 1 2 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 4 4 3 4 1 1 1 1 1 1 2 2 1 2 2 2 9 9 4 1 2 7 3 7 6 5 7 72 66 65 65 63 64 64 64 64 64 65 64 64 69 72 72 13 72 72 72 71 72 82 83 73 74 74 74 74 2 2 11 11 7 8 8 65 54 12 2 1 1 65 54 12 3 1 1 1 1 1 2 1 1 6 2 5 11 2 10 66 66 65 66 1 68 68 68 68 68 3 3 1 2 9 8 9 11 72 72 9 9 9 9 9 8 8 8 9 8 8 9 8 8 1 1 64 64 64 64 3 3 3 72 72 21 21 20 1 3 17 2 2 2 2 2 2 2 2 244 9 64 3 68 72 72 72 72 72 72 71 72 72 72 58 15 23 23 2 21 12 9 1 92 92 1 91 90 1 1 3 1 5 3 19 19 2 11 13 23 5 17 75 76 55 19 27 40 80 6 74 4 4 8 5 5 8 8 8 8 8 8 8 4 1 4 3 3 2 1 166 165 164 165 149 16 166 165 165 166 166 6 164 166 59 107 165 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/coproc.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Authors: Rusty Russell <rusty@rustcorp.com.au> * Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/bitfield.h> #include <linux/bsearch.h> #include <linux/cacheinfo.h> #include <linux/debugfs.h> #include <linux/kvm_host.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/uaccess.h> #include <linux/irqchip/arm-gic-v3.h> #include <asm/arm_pmuv3.h> #include <asm/cacheflush.h> #include <asm/cputype.h> #include <asm/debug-monitors.h> #include <asm/esr.h> #include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/perf_event.h> #include <asm/sysreg.h> #include <trace/events/kvm.h> #include "sys_regs.h" #include "vgic/vgic.h" #include "trace.h" /* * For AArch32, we only take care of what is being trapped. Anything * that has to do with init and userspace access has to go via the * 64bit interface. */ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { kvm_inject_undefined(vcpu); return false; } static bool bad_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r, const char *msg) { WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); return undef_access(vcpu, params, r); } static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { return bad_trap(vcpu, params, r, "sys_reg read to write-only register"); } static bool write_to_read_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { return bad_trap(vcpu, params, r, "sys_reg write to read-only register"); } #define PURE_EL2_SYSREG(el2) \ case el2: { \ *el1r = el2; \ return true; \ } #define MAPPED_EL2_SYSREG(el2, el1, fn) \ case el2: { \ *xlate = fn; \ *el1r = el1; \ return true; \ } static bool get_el2_to_el1_mapping(unsigned int reg, unsigned int *el1r, u64 (**xlate)(u64)) { switch (reg) { PURE_EL2_SYSREG( VPIDR_EL2 ); PURE_EL2_SYSREG( VMPIDR_EL2 ); PURE_EL2_SYSREG( ACTLR_EL2 ); PURE_EL2_SYSREG( HCR_EL2 ); PURE_EL2_SYSREG( MDCR_EL2 ); PURE_EL2_SYSREG( HSTR_EL2 ); PURE_EL2_SYSREG( HACR_EL2 ); PURE_EL2_SYSREG( VTTBR_EL2 ); PURE_EL2_SYSREG( VTCR_EL2 ); PURE_EL2_SYSREG( RVBAR_EL2 ); PURE_EL2_SYSREG( TPIDR_EL2 ); PURE_EL2_SYSREG( HPFAR_EL2 ); PURE_EL2_SYSREG( HCRX_EL2 ); PURE_EL2_SYSREG( HFGRTR_EL2 ); PURE_EL2_SYSREG( HFGWTR_EL2 ); PURE_EL2_SYSREG( HFGITR_EL2 ); PURE_EL2_SYSREG( HDFGRTR_EL2 ); PURE_EL2_SYSREG( HDFGWTR_EL2 ); PURE_EL2_SYSREG( HAFGRTR_EL2 ); PURE_EL2_SYSREG( CNTVOFF_EL2 ); PURE_EL2_SYSREG( CNTHCTL_EL2 ); MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, translate_sctlr_el2_to_sctlr_el1 ); MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1, translate_cptr_el2_to_cpacr_el1 ); MAPPED_EL2_SYSREG(TTBR0_EL2, TTBR0_EL1, translate_ttbr0_el2_to_ttbr0_el1 ); MAPPED_EL2_SYSREG(TTBR1_EL2, TTBR1_EL1, NULL ); MAPPED_EL2_SYSREG(TCR_EL2, TCR_EL1, translate_tcr_el2_to_tcr_el1 ); MAPPED_EL2_SYSREG(VBAR_EL2, VBAR_EL1, NULL ); MAPPED_EL2_SYSREG(AFSR0_EL2, AFSR0_EL1, NULL ); MAPPED_EL2_SYSREG(AFSR1_EL2, AFSR1_EL1, NULL ); MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); default: return false; } } u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val = 0x8badf00d8badf00d; u64 (*xlate)(u64) = NULL; unsigned int el1r; if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) goto memory_read; if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { if (!is_hyp_ctxt(vcpu)) goto memory_read; /* * CNTHCTL_EL2 requires some special treatment to * account for the bits that can be set via CNTKCTL_EL1. */ switch (reg) { case CNTHCTL_EL2: if (vcpu_el2_e2h_is_set(vcpu)) { val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; return val; } break; } /* * If this register does not have an EL1 counterpart, * then read the stored EL2 version. */ if (reg == el1r) goto memory_read; /* * If we have a non-VHE guest and that the sysreg * requires translation to be used at EL1, use the * in-memory copy instead. */ if (!vcpu_el2_e2h_is_set(vcpu) && xlate) goto memory_read; /* Get the current version of the EL1 counterpart. */ WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); if (reg >= __SANITISED_REG_START__) val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); return val; } /* EL1 register can't be on the CPU if the guest is in vEL2. */ if (unlikely(is_hyp_ctxt(vcpu))) goto memory_read; if (__vcpu_read_sys_reg_from_cpu(reg, &val)) return val; memory_read: return __vcpu_sys_reg(vcpu, reg); } void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { u64 (*xlate)(u64) = NULL; unsigned int el1r; if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) goto memory_write; if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { if (!is_hyp_ctxt(vcpu)) goto memory_write; /* * Always store a copy of the write to memory to avoid having * to reverse-translate virtual EL2 system registers for a * non-VHE guest hypervisor. */ __vcpu_sys_reg(vcpu, reg) = val; switch (reg) { case CNTHCTL_EL2: /* * If E2H=0, CNHTCTL_EL2 is a pure shadow register. * Otherwise, some of the bits are backed by * CNTKCTL_EL1, while the rest is kept in memory. * Yes, this is fun stuff. */ if (vcpu_el2_e2h_is_set(vcpu)) write_sysreg_el1(val, SYS_CNTKCTL); return; } /* No EL1 counterpart? We're done here.? */ if (reg == el1r) return; if (!vcpu_el2_e2h_is_set(vcpu) && xlate) val = xlate(val); /* Redirect this to the EL1 version of the register. */ WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r)); return; } /* EL1 register can't be on the CPU if the guest is in vEL2. */ if (unlikely(is_hyp_ctxt(vcpu))) goto memory_write; if (__vcpu_write_sys_reg_to_cpu(val, reg)) return; memory_write: __vcpu_sys_reg(vcpu, reg) = val; } /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 14 /* * Returns the minimum line size for the selected cache, expressed as * Log2(bytes). */ static u8 get_min_cache_line_size(bool icache) { u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0); u8 field; if (icache) field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr); else field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr); /* * Cache line size is represented as Log2(words) in CTR_EL0. * Log2(bytes) can be derived with the following: * * Log2(words) + 2 = Log2(bytes / 4) + 2 * = Log2(bytes) - 2 + 2 * = Log2(bytes) */ return field + 2; } /* Which cache CCSIDR represents depends on CSSELR value. */ static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr) { u8 line_size; if (vcpu->arch.ccsidr) return vcpu->arch.ccsidr[csselr]; line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD); /* * Fabricate a CCSIDR value as the overriding value does not exist. * The real CCSIDR value will not be used as it can vary by the * physical CPU which the vcpu currently resides in. * * The line size is determined with get_min_cache_line_size(), which * should be valid for all CPUs even if they have different cache * configuration. * * The associativity bits are cleared, meaning the geometry of all data * and unified caches (which are guaranteed to be PIPT and thus * non-aliasing) are 1 set and 1 way. * Guests should not be doing cache operations by set/way at all, and * for this reason, we trap them and attempt to infer the intent, so * that we can flush the entire guest's address space at the appropriate * time. The exposed geometry minimizes the number of the traps. * [If guests should attempt to infer aliasing properties from the * geometry (which is not permitted by the architecture), they would * only do so for virtually indexed caches.] * * We don't check if the cache level exists as it is allowed to return * an UNKNOWN value if not. */ return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4); } static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val) { u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4; u32 *ccsidr = vcpu->arch.ccsidr; u32 i; if ((val & CCSIDR_EL1_RES0) || line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD)) return -EINVAL; if (!ccsidr) { if (val == get_ccsidr(vcpu, csselr)) return 0; ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT); if (!ccsidr) return -ENOMEM; for (i = 0; i < CSSELR_MAX; i++) ccsidr[i] = get_ccsidr(vcpu, i); vcpu->arch.ccsidr = ccsidr; } ccsidr[csselr] = val; return 0; } static bool access_rw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, r->reg); else p->regval = vcpu_read_sys_reg(vcpu, r->reg); return true; } /* * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). */ static bool access_dcsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!p->is_write) return read_from_write_only(vcpu, p, r); /* * Only track S/W ops if we don't have FWB. It still indicates * that the guest is a bit broken (S/W operations should only * be done by firmware, knowing that there is only a single * CPU left in the system, and certainly not from non-secure * software). */ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) kvm_set_way_flush(vcpu); return true; } static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!kvm_has_mte(vcpu->kvm)) return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); } static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift) { switch (r->aarch32_map) { case AA32_LO: *mask = GENMASK_ULL(31, 0); *shift = 0; break; case AA32_HI: *mask = GENMASK_ULL(63, 32); *shift = 32; break; default: *mask = GENMASK_ULL(63, 0); *shift = 0; break; } } /* * Generic accessor for VM registers. Only called as long as HCR_TVM * is set. If the guest enables the MMU, we stop trapping the VM * sys_regs and leave it in complete control of the caches. */ static bool access_vm_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { bool was_enabled = vcpu_has_cache_enabled(vcpu); u64 val, mask, shift; BUG_ON(!p->is_write); get_access_mask(r, &mask, &shift); if (~mask) { val = vcpu_read_sys_reg(vcpu, r->reg); val &= ~mask; } else { val = 0; } val |= (p->regval & (mask >> shift)) << shift; vcpu_write_sys_reg(vcpu, val, r->reg); kvm_toggle_cache(vcpu, was_enabled); return true; } static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask, shift; if (p->is_write) return ignore_write(vcpu, p); get_access_mask(r, &mask, &shift); p->regval = (vcpu_read_sys_reg(vcpu, r->reg) & mask) >> shift; return true; } /* * Trap handler for the GICv3 SGI generation system register. * Forward the request to the VGIC emulation. * The cp15_64 code makes sure this automatically works * for both AArch64 and AArch32 accesses. */ static bool access_gic_sgi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { bool g1; if (!kvm_has_gicv3(vcpu->kvm)) return undef_access(vcpu, p, r); if (!p->is_write) return read_from_write_only(vcpu, p, r); /* * In a system where GICD_CTLR.DS=1, a ICC_SGI0R_EL1 access generates * Group0 SGIs only, while ICC_SGI1R_EL1 can generate either group, * depending on the SGI configuration. ICC_ASGI1R_EL1 is effectively * equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure * group. */ if (p->Op0 == 0) { /* AArch32 */ switch (p->Op1) { default: /* Keep GCC quiet */ case 0: /* ICC_SGI1R */ g1 = true; break; case 1: /* ICC_ASGI1R */ case 2: /* ICC_SGI0R */ g1 = false; break; } } else { /* AArch64 */ switch (p->Op2) { default: /* Keep GCC quiet */ case 5: /* ICC_SGI1R_EL1 */ g1 = true; break; case 6: /* ICC_ASGI1R_EL1 */ case 7: /* ICC_SGI0R_EL1 */ g1 = false; break; } } vgic_v3_dispatch_sgi(vcpu, p->regval, g1); return true; } static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!kvm_has_gicv3(vcpu->kvm)) return undef_access(vcpu, p, r); if (p->is_write) return ignore_write(vcpu, p); if (p->Op1 == 4) { /* ICC_SRE_EL2 */ p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB); } else { /* ICC_SRE_EL1 */ p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; } return true; } static bool trap_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return ignore_write(vcpu, p); else return read_zero(vcpu, p); } /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 * system, these registers should UNDEF. LORID_EL1 being a RO register, we * treat it separately. */ static bool trap_loregion(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sr = reg_to_encoding(r); if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); return trap_raz_wi(vcpu, p, r); } static bool trap_oslar_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!p->is_write) return read_from_write_only(vcpu, p, r); kvm_debug_handle_oslar(vcpu, p->regval); return true; } static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = __vcpu_sys_reg(vcpu, r->reg); return true; } static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { /* * The only modifiable bit is the OSLK bit. Refuse the write if * userspace attempts to change any other bit in the register. */ if ((val ^ rd->val) & ~OSLSR_EL1_OSLK) return -EINVAL; __vcpu_sys_reg(vcpu, rd->reg) = val; return 0; } static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { return ignore_write(vcpu, p); } else { p->regval = read_sysreg(dbgauthstatus_el1); return true; } } static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { access_rw(vcpu, p, r); kvm_debug_set_guest_ownership(vcpu); return true; } /* * reg_to_dbg/dbg_to_reg * * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd, u64 *dbg_reg) { u64 mask, shift, val; get_access_mask(rd, &mask, &shift); val = *dbg_reg; val &= ~mask; val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; } static void dbg_to_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd, u64 *dbg_reg) { u64 mask, shift; get_access_mask(rd, &mask, &shift); p->regval = (*dbg_reg & mask) >> shift; } static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; switch (rd->Op2) { case 0b100: return &dbg->dbg_bvr[rd->CRm]; case 0b101: return &dbg->dbg_bcr[rd->CRm]; case 0b110: return &dbg->dbg_wvr[rd->CRm]; case 0b111: return &dbg->dbg_wcr[rd->CRm]; default: KVM_BUG_ON(1, vcpu->kvm); return NULL; } } static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { u64 *reg = demux_wb_reg(vcpu, rd); if (!reg) return false; if (p->is_write) reg_to_dbg(vcpu, p, rd, reg); else dbg_to_reg(vcpu, p, rd, reg); kvm_debug_set_guest_ownership(vcpu); return true; } static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u64 *reg = demux_wb_reg(vcpu, rd); if (!reg) return -EINVAL; *reg = val; return 0; } static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { u64 *reg = demux_wb_reg(vcpu, rd); if (!reg) return -EINVAL; *val = *reg; return 0; } static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { u64 *reg = demux_wb_reg(vcpu, rd); /* * Bail early if we couldn't find storage for the register, the * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever * being run. */ if (!reg) return 0; *reg = rd->val; return rd->val; } static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair = read_sysreg(amair_el1); vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); return amair; } static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 actlr = read_sysreg(actlr_el1); vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); return actlr; } static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mpidr; /* * Map the vcpu_id into the first three affinity level fields of * the MPIDR. We limit the number of VCPUs in level 0 due to a * limitation to 16 CPUs in that level in the ICC_SGIxR registers * of the GICv3 to be able to address each CPU directly when * sending IPIs. */ mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); mpidr |= (1ULL << 31); vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); return mpidr; } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (kvm_vcpu_has_pmu(vcpu)) return 0; return REG_HIDDEN; } static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); u8 n = vcpu->kvm->arch.nr_pmu_counters; if (n) mask |= GENMASK(n - 1, 0); reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= mask; return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { /* This thing will UNDEF, who cares about the reset value? */ if (!kvm_vcpu_has_pmu(vcpu)) return 0; reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_evtyper_mask(vcpu->kvm); return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr = 0; if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; /* * The value of PMCR.N field is included when the * vCPU register is read via kvm_vcpu_read_pmcr(). */ __vcpu_sys_reg(vcpu, r->reg) = pmcr; return __vcpu_sys_reg(vcpu, r->reg); } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) { u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); if (!enabled) kvm_inject_undefined(vcpu); return !enabled; } static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); } static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN); } static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN); } static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN); } static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 val; if (pmu_access_el0_disabled(vcpu)) return false; if (p->is_write) { /* * Only update writeable bits of PMCR (continuing into * kvm_pmu_handle_pmcr() as well) */ val = kvm_vcpu_read_pmcr(vcpu); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; kvm_pmu_handle_pmcr(vcpu, val); } else { /* PMCR.P & PMCR.C are RAZ */ val = kvm_vcpu_read_pmcr(vcpu) & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); p->regval = val; } return true; } static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (pmu_access_event_counter_el0_disabled(vcpu)) return false; if (p->is_write) __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval; else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) & PMSELR_EL0_SEL_MASK; return true; } static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 pmceid, mask, shift; BUG_ON(p->is_write); if (pmu_access_el0_disabled(vcpu)) return false; get_access_mask(r, &mask, &shift); pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); pmceid &= mask; pmceid >>= shift; p->regval = pmceid; return true; } static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) { u64 pmcr, val; pmcr = kvm_vcpu_read_pmcr(vcpu); val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); return false; } return true; } static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { u64 idx; if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) /* PMCCNTR_EL0 */ idx = ARMV8_PMU_CYCLE_IDX; else /* PMEVCNTRn_EL0 */ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); *val = kvm_pmu_get_counter_value(vcpu, idx); return 0; } static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u64 idx; if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) /* PMCCNTR_EL0 */ idx = ARMV8_PMU_CYCLE_IDX; else /* PMEVCNTRn_EL0 */ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); kvm_pmu_set_counter_value_user(vcpu, idx, val); return 0; } static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 idx = ~0UL; if (r->CRn == 9 && r->CRm == 13) { if (r->Op2 == 2) { /* PMXEVCNTR_EL0 */ if (pmu_access_event_counter_el0_disabled(vcpu)) return false; idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) return false; idx = ARMV8_PMU_CYCLE_IDX; } } else if (r->CRn == 0 && r->CRm == 9) { /* PMCCNTR */ if (pmu_access_event_counter_el0_disabled(vcpu)) return false; idx = ARMV8_PMU_CYCLE_IDX; } else if (r->CRn == 14 && (r->CRm & 12) == 8) { /* PMEVCNTRn_EL0 */ if (pmu_access_event_counter_el0_disabled(vcpu)) return false; idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); } /* Catch any decoding mistake */ WARN_ON(idx == ~0UL); if (!pmu_counter_idx_valid(vcpu, idx)) return false; if (p->is_write) { if (pmu_access_el0_disabled(vcpu)) return false; kvm_pmu_set_counter_value(vcpu, idx, p->regval); } else { p->regval = kvm_pmu_get_counter_value(vcpu, idx); } return true; } static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 idx, reg; if (pmu_access_el0_disabled(vcpu)) return false; if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); if (idx == ARMV8_PMU_CYCLE_IDX) reg = PMCCFILTR_EL0; else /* PMEVTYPERn_EL0 */ reg = PMEVTYPER0_EL0 + idx; } else { BUG(); } if (!pmu_counter_idx_valid(vcpu, idx)) return false; if (p->is_write) { kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); kvm_vcpu_pmu_restore_guest(vcpu); } else { p->regval = __vcpu_sys_reg(vcpu, reg); } return true; } static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); __vcpu_sys_reg(vcpu, r->reg) = val & mask; kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; } static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; } static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 val, mask; if (pmu_access_el0_disabled(vcpu)) return false; mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; else /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; kvm_pmu_reprogram_counter_mask(vcpu, val); } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } return true; } static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; if (p->is_write) { u64 val = p->regval & mask; if (r->Op2 & 0x1) /* accessing PMINTENSET_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val; else /* accessing PMINTENCLR_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; } else { p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } return true; } static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); else /* accessing PMOVSCLR_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); } else { p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } return true; } static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask; if (!p->is_write) return read_from_write_only(vcpu, p, r); if (pmu_write_swinc_el0_disabled(vcpu)) return false; mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { if (!vcpu_mode_priv(vcpu)) return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; } else { p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0) & ARMV8_PMU_USERENR_MASK; } return true; } static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { *val = kvm_vcpu_read_pmcr(vcpu); return 0; } static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); struct kvm *kvm = vcpu->kvm; mutex_lock(&kvm->arch.config_lock); /* * The vCPU can't have more counters than the PMU hardware * implements. Ignore this error to maintain compatibility * with the existing KVM behavior. */ if (!kvm_vm_has_ran_once(kvm) && !vcpu_has_nv(vcpu) && new_n <= kvm_arm_pmu_get_max_counters(kvm)) kvm->arch.nr_pmu_counters = new_n; mutex_unlock(&kvm->arch.config_lock); /* * Ignore writes to RES0 bits, read only bits that are cleared on * vCPU reset, and writable bits that KVM doesn't support yet. * (i.e. only PMCR.N and bits [7:0] are mutable from userspace) * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU. * But, we leave the bit as it is here, as the vCPU's PMUver might * be changed later (NOTE: the bit will be cleared on first vCPU run * if necessary). */ val &= ARMV8_PMU_PMCR_MASK; /* The LC bit is RES1 when AArch32 is not supported */ if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = val; kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; } /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg } #define PMU_SYS_REG(name) \ SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(PMEVCNTRn_EL0(n)), \ .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ .set_user = set_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ { PMU_SYS_REG(PMEVTYPERn_EL0(n)), \ .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } #define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), undef_access } #define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), undef_access } static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN; } /* * If we land here on a PtrAuth access, that is because we didn't * fixup the access on exit by allowing the PtrAuth sysregs. The only * way this happens is when the guest does not have PtrAuth support * enabled. */ #define __PTRAUTH_KEY(k) \ { SYS_DESC(SYS_## k), undef_access, reset_unknown, k, \ .visibility = ptrauth_visibility} #define PTRAUTH_KEY(k) \ __PTRAUTH_KEY(k ## KEYLO_EL1), \ __PTRAUTH_KEY(k ## KEYHI_EL1) static bool access_arch_timer(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { enum kvm_arch_timers tmr; enum kvm_arch_timer_regs treg; u64 reg = reg_to_encoding(r); switch (reg) { case SYS_CNTP_TVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTV_TVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_TVAL; break; case SYS_AARCH32_CNTP_TVAL: case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTV_TVAL_EL02: tmr = TIMER_VTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTHP_TVAL_EL2: tmr = TIMER_HPTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTHV_TVAL_EL2: tmr = TIMER_HVTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTP_CTL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTV_CTL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_CTL; break; case SYS_AARCH32_CNTP_CTL: case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTV_CTL_EL02: tmr = TIMER_VTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTHP_CTL_EL2: tmr = TIMER_HPTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTHV_CTL_EL2: tmr = TIMER_HVTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTP_CVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTV_CVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_CVAL; break; case SYS_AARCH32_CNTP_CVAL: case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTV_CVAL_EL02: tmr = TIMER_VTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTHP_CVAL_EL2: tmr = TIMER_HPTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTHV_CVAL_EL2: tmr = TIMER_HVTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (is_hyp_ctxt(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; case SYS_AARCH32_CNTPCT: case SYS_AARCH32_CNTPCTSS: tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; case SYS_CNTVCT_EL0: case SYS_CNTVCTSS_EL0: if (is_hyp_ctxt(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_CNT; break; case SYS_AARCH32_CNTVCT: case SYS_AARCH32_CNTVCTSS: tmr = TIMER_VTIMER; treg = TIMER_REG_CNT; break; default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); return undef_access(vcpu, p, r); } if (p->is_write) kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval); else p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg); return true; } static bool access_hv_timer(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!vcpu_el2_e2h_is_set(vcpu)) return undef_access(vcpu, p, r); return access_arch_timer(vcpu, p, r); } static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { struct arm64_ftr_bits kvm_ftr = *ftrp; /* Some features have different safe value type in KVM than host features */ switch (id) { case SYS_ID_AA64DFR0_EL1: switch (kvm_ftr.shift) { case ID_AA64DFR0_EL1_PMUVer_SHIFT: kvm_ftr.type = FTR_LOWER_SAFE; break; case ID_AA64DFR0_EL1_DebugVer_SHIFT: kvm_ftr.type = FTR_LOWER_SAFE; break; } break; case SYS_ID_DFR0_EL1: if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT) kvm_ftr.type = FTR_LOWER_SAFE; break; } return arm64_ftr_safe_value(&kvm_ftr, new, cur); } /* * arm64_check_features() - Check if a feature register value constitutes * a subset of features indicated by the idreg's KVM sanitised limit. * * This function will check if each feature field of @val is the "safe" value * against idreg's KVM sanitised limit return from reset() callback. * If a field value in @val is the same as the one in limit, it is always * considered the safe value regardless For register fields that are not in * writable, only the value in limit is considered the safe value. * * Return: 0 if all the fields are safe. Otherwise, return negative errno. */ static int arm64_check_features(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { const struct arm64_ftr_reg *ftr_reg; const struct arm64_ftr_bits *ftrp = NULL; u32 id = reg_to_encoding(rd); u64 writable_mask = rd->val; u64 limit = rd->reset(vcpu, rd); u64 mask = 0; /* * Hidden and unallocated ID registers may not have a corresponding * struct arm64_ftr_reg. Of course, if the register is RAZ we know the * only safe value is 0. */ if (sysreg_visible_as_raz(vcpu, rd)) return val ? -E2BIG : 0; ftr_reg = get_arm64_ftr_reg(id); if (!ftr_reg) return -EINVAL; ftrp = ftr_reg->ftr_bits; for (; ftrp && ftrp->width; ftrp++) { s64 f_val, f_lim, safe_val; u64 ftr_mask; ftr_mask = arm64_ftr_mask(ftrp); if ((ftr_mask & writable_mask) != ftr_mask) continue; f_val = arm64_ftr_value(ftrp, val); f_lim = arm64_ftr_value(ftrp, limit); mask |= ftr_mask; if (f_val == f_lim) safe_val = f_val; else safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); if (safe_val != f_val) return -E2BIG; } /* For fields that are not writable, values in limit are the safe values. */ if ((val & ~mask) != (limit & ~mask)) return -E2BIG; return 0; } static u8 pmuver_to_perfmon(u8 pmuver) { switch (pmuver) { case ID_AA64DFR0_EL1_PMUVer_IMP: return ID_DFR0_EL1_PerfMon_PMUv3; case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: return ID_DFR0_EL1_PerfMon_IMPDEF; default: /* Anything ARMv8.1+ and NI have the same value. For now. */ return pmuver; } } static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u32 id = reg_to_encoding(r); u64 val; if (sysreg_visible_as_raz(vcpu, r)) return 0; val = read_sanitised_ftr_reg(id); switch (id) { case SYS_ID_AA64DFR0_EL1: val = sanitise_id_aa64dfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR0_EL1: val = sanitise_id_aa64pfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) { val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); } val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); break; case SYS_ID_AA64PFR2_EL1: /* We only expose FPMR */ val &= ID_AA64PFR2_EL1_FPMR; break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); break; case SYS_ID_AA64ISAR2_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); if (!cpus_have_final_cap(ARM64_HAS_WFXT) || has_broken_cntvoff()) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; case SYS_ID_AA64ISAR3_EL1: val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX; break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; val &= ~ID_AA64MMFR2_EL1_NV; break; case SYS_ID_AA64MMFR3_EL1: val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | ID_AA64MMFR3_EL1_S1PIE; break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; } if (vcpu_has_nv(vcpu)) val = limit_nv_id_reg(vcpu->kvm, id, val); return val; } static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return __kvm_read_sanitised_id_reg(vcpu, r); } static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r)); } static bool is_feature_id_reg(u32 encoding) { return (sys_reg_Op0(encoding) == 3 && (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && sys_reg_CRn(encoding) == 0 && sys_reg_CRm(encoding) <= 7); } /* * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID * registers KVM maintains on a per-VM basis. * * Additionally, the implementation ID registers and CTR_EL0 are handled as * per-VM registers. */ static inline bool is_vm_ftr_id_reg(u32 id) { switch (id) { case SYS_CTR_EL0: case SYS_MIDR_EL1: case SYS_REVIDR_EL1: case SYS_AIDR_EL1: return true; default: return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) < 8); } } static inline bool is_vcpu_ftr_id_reg(u32 id) { return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id); } static inline bool is_aa32_id_reg(u32 id) { return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) <= 3); } static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u32 id = reg_to_encoding(r); switch (id) { case SYS_ID_AA64ZFR0_EL1: if (!vcpu_has_sve(vcpu)) return REG_RAZ; break; } return 0; } static unsigned int aa32_id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { /* * AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any * EL. Promote to RAZ/WI in order to guarantee consistency between * systems. */ if (!kvm_supports_32bit_el0()) return REG_RAZ | REG_USER_WI; return id_visibility(vcpu, r); } static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return REG_RAZ; } /* cpufeature ID register access trap handlers */ static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = read_id_reg(vcpu, r); return true; } /* Visibility overrides for SVE-specific control registers */ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (vcpu_has_sve(vcpu)) return 0; return REG_HIDDEN; } static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) return 0; return REG_HIDDEN; } static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_fpmr(vcpu->kvm)) return 0; return REG_HIDDEN; } static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { if (!vcpu_has_sve(vcpu)) val &= ~ID_AA64PFR0_EL1_SVE_MASK; /* * The default is to expose CSV2 == 1 if the HW isn't affected. * Although this is a per-CPU feature, we make it global because * asymmetric systems are just a nuisance. * * Userspace can override this as long as it doesn't promise * the impossible. */ if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { val &= ~ID_AA64PFR0_EL1_CSV2_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP); } if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { val &= ~ID_AA64PFR0_EL1_CSV3_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); } if (kvm_vgic_global_state.type == VGIC_V3) { val &= ~ID_AA64PFR0_EL1_GIC_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); } val &= ~ID_AA64PFR0_EL1_AMU_MASK; /* * MPAM is disabled by default as KVM also needs a set of PARTID to * program the MPAMVPMx_EL2 PARTID remapping registers with. But some * older kernels let the guest see the ID bit. */ val &= ~ID_AA64PFR0_EL1_MPAM_MASK; return val; } static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); /* * Only initialize the PMU version if the vCPU was configured with one. */ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; if (kvm_vcpu_has_pmu(vcpu)) val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer, kvm_arm_pmu_get_pmuver_limit()); /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; /* Hide BRBE from guests */ val &= ~ID_AA64DFR0_EL1_BRBE_MASK; return val; } static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); /* * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously * exposed an IMP_DEF PMU to userspace and the guest on systems w/ * non-architectural PMUs. Of course, PMUv3 is the only game in town for * PMU virtualization, so the IMP_DEF value was rather user-hostile. * * At minimum, we're on the hook to allow values that were given to * userspace by KVM. Cover our tracks here and replace the IMP_DEF value * with a more sensible NI. The value of an ID register changing under * the nose of the guest is unfortunate, but is certainly no more * surprising than an ill-guided PMU driver poking at impdef system * registers that end in an UNDEF... */ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; /* * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a * nonzero minimum safe value. */ if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP) return -EINVAL; return set_id_reg(vcpu, rd, val); } static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { u8 perfmon; u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); val &= ~ID_DFR0_EL1_PerfMon_MASK; if (kvm_vcpu_has_pmu(vcpu)) { perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); } val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8); return val; } static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); u8 copdbg = SYS_FIELD_GET(ID_DFR0_EL1, CopDbg, val); if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { val &= ~ID_DFR0_EL1_PerfMon_MASK; perfmon = 0; } /* * Allow DFR0_EL1.PerfMon to be set from userspace as long as * it doesn't promise more than what the HW gives us on the * AArch64 side (as everything is emulated with that), and * that this is a PMUv3. */ if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) return -EINVAL; if (copdbg < ID_DFR0_EL1_CopDbg_Armv8) return -EINVAL; return set_id_reg(vcpu, rd, val); } static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; /* * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to * guests, but didn't add trap handling. KVM doesn't support MPAM and * always returns an UNDEF for these registers. The guest must see 0 * for this field. * * But KVM must also accept values from user-space that were provided * by KVM. On CPUs that support MPAM, permit user-space to write * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. */ if ((hw_val & mpam_mask) == (user_val & mpam_mask)) user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; return set_id_reg(vcpu, rd, user_val); } static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; u8 mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, hw_val); u8 user_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val); u8 hw_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, hw_val); /* See set_id_aa64pfr0_el1 for comment about MPAM */ if ((hw_val & mpam_mask) == (user_val & mpam_mask)) user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; /* * Previously MTE_frac was hidden from guest. However, if the * hardware supports MTE2 but not MTE_ASYM_FAULT then a value * of 0 for this field indicates that the hardware supports * MTE_ASYNC. Whereas, 0xf indicates MTE_ASYNC is not supported. * * As KVM must accept values from KVM provided by user-space, * when ID_AA64PFR1_EL1.MTE is 2 allow user-space to set * ID_AA64PFR1_EL1.MTE_frac to 0. However, ignore it to avoid * incorrectly claiming hardware support for MTE_ASYNC in the * guest. */ if (mte == ID_AA64PFR1_EL1_MTE_MTE2 && hw_mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI && user_mte_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC) { user_val &= ~ID_AA64PFR1_EL1_MTE_frac_MASK; user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK; } return set_id_reg(vcpu, rd, user_val); } static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u64 sanitized_val = kvm_read_sanitised_id_reg(vcpu, rd); u64 tgran2_mask = ID_AA64MMFR0_EL1_TGRAN4_2_MASK | ID_AA64MMFR0_EL1_TGRAN16_2_MASK | ID_AA64MMFR0_EL1_TGRAN64_2_MASK; if (vcpu_has_nv(vcpu) && ((sanitized_val & tgran2_mask) != (user_val & tgran2_mask))) return -EINVAL; return set_id_reg(vcpu, rd, user_val); } static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK; /* * We made the mistake to expose the now deprecated NV field, * so allow userspace to write it, but silently ignore it. */ if ((hw_val & nv_mask) == (user_val & nv_mask)) user_val &= ~nv_mask; return set_id_reg(vcpu, rd, user_val); } static int set_ctr_el0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val); /* * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based * on what hardware reports. * * Using a VIPT software model on PIPT will lead to over invalidation, * but still correct. Hence, we can allow downgrading PIPT to VIPT, * but not the other way around. This is handled via arm64_ftr_safe_value() * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value * set as VIPT. */ switch (user_L1Ip) { case CTR_EL0_L1Ip_RESERVED_VPIPT: case CTR_EL0_L1Ip_RESERVED_AIVIVT: return -EINVAL; case CTR_EL0_L1Ip_VIPT: case CTR_EL0_L1Ip_PIPT: return set_id_reg(vcpu, rd, user_val); default: return -ENOENT; } } /* * cpufeature ID register user accessors * * For now, these registers are immutable for userspace, so no values * are stored, and for set_id_reg() we don't allow the effective value * to be changed. */ static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { /* * Avoid locking if the VM has already started, as the ID registers are * guaranteed to be invariant at that point. */ if (kvm_vm_has_ran_once(vcpu->kvm)) { *val = read_id_reg(vcpu, rd); return 0; } mutex_lock(&vcpu->kvm->arch.config_lock); *val = read_id_reg(vcpu, rd); mutex_unlock(&vcpu->kvm->arch.config_lock); return 0; } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u32 id = reg_to_encoding(rd); int ret; mutex_lock(&vcpu->kvm->arch.config_lock); /* * Once the VM has started the ID registers are immutable. Reject any * write that does not match the final register value. */ if (kvm_vm_has_ran_once(vcpu->kvm)) { if (val != read_id_reg(vcpu, rd)) ret = -EBUSY; else ret = 0; mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; } ret = arm64_check_features(vcpu, rd, val); if (!ret) kvm_set_vm_id_reg(vcpu->kvm, id, val); mutex_unlock(&vcpu->kvm->arch.config_lock); /* * arm64_check_features() returns -E2BIG to indicate the register's * feature set is a superset of the maximally-allowed register value. * While it would be nice to precisely describe this to userspace, the * existing UAPI for KVM_SET_ONE_REG has it that invalid register * writes return -EINVAL. */ if (ret == -E2BIG) ret = -EINVAL; return ret; } void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val) { u64 *p = __vm_id_reg(&kvm->arch, reg); lockdep_assert_held(&kvm->arch.config_lock); if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm)) return; *p = val; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { *val = 0; return 0; } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { return 0; } static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0); return true; } static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = __vcpu_sys_reg(vcpu, r->reg); return true; } /* * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary * by the physical CPU which the vcpu currently resides in. */ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); u64 clidr; u8 loc; if ((ctr_el0 & CTR_EL0_IDC)) { /* * Data cache clean to the PoU is not required so LoUU and LoUIS * will not be set and a unified cache, which will be marked as * LoC, will be added. * * If not DIC, let the unified cache L2 so that an instruction * cache can be added as L1 later. */ loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2; clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc); } else { /* * Data cache clean to the PoU is required so let L1 have a data * cache and mark it as LoUU and LoUIS. As L1 has a data cache, * it can be marked as LoC too. */ loc = 1; clidr = 1 << CLIDR_LOUU_SHIFT; clidr |= 1 << CLIDR_LOUIS_SHIFT; clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1); } /* * Instruction cache invalidation to the PoU is required so let L1 have * an instruction cache. If L1 already has a data cache, it will be * CACHE_TYPE_SEPARATE. */ if (!(ctr_el0 & CTR_EL0_DIC)) clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1); clidr |= loc << CLIDR_LOC_SHIFT; /* * Add tag cache unified to data cache. Allocation tags and data are * unified in a cache line so that it looks valid even if there is only * one cache line. */ if (kvm_has_mte(vcpu->kvm)) clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); __vcpu_sys_reg(vcpu, r->reg) = clidr; return __vcpu_sys_reg(vcpu, r->reg); } static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val)); if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) return -EINVAL; __vcpu_sys_reg(vcpu, rd->reg) = val; return 0; } static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { int reg = r->reg; if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, reg); else p->regval = vcpu_read_sys_reg(vcpu, reg); return true; } static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 csselr; if (p->is_write) return write_to_read_only(vcpu, p, r); csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD; if (csselr < CSSELR_MAX) p->regval = get_ccsidr(vcpu, csselr); return true; } static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_mte(vcpu->kvm)) return 0; return REG_HIDDEN; } #define MTE_REG(name) { \ SYS_DESC(SYS_##name), \ .access = undef_access, \ .reset = reset_unknown, \ .reg = name, \ .visibility = mte_visibility, \ } static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (vcpu_has_nv(vcpu)) return 0; return REG_HIDDEN; } static bool bad_vncr_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { /* * We really shouldn't be here, and this is likely the result * of a misconfigured trap, as this register should target the * VNCR page, and nothing else. */ return bad_trap(vcpu, p, r, "trap of VNCR-backed register"); } static bool bad_redir_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { /* * We really shouldn't be here, and this is likely the result * of a misconfigured trap, as this register should target the * corresponding EL1, and nothing else. */ return bad_trap(vcpu, p, r, "trap of EL2 register redirected to EL1"); } #define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ SYS_DESC(SYS_##name), \ .access = acc, \ .reset = rst, \ .reg = name, \ .visibility = filter, \ .val = v, \ } #define EL2_REG(name, acc, rst, v) \ EL2_REG_FILTERED(name, acc, rst, v, el2_visibility) #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) /* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. * The reset() would return KVM sanitised register value. The value would be the * same as the host kernel sanitised value if there is no KVM sanitisation. * The val would be used as a mask indicating writable fields for the idreg. * Only bits with 1 are writable from userspace. This mask might not be * necessary in the future whenever all ID registers are enabled as writable * from userspace. */ #define ID_DESC_DEFAULT_CALLBACKS \ .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = id_visibility, \ .reset = kvm_read_sanitised_id_reg #define ID_DESC(name) \ SYS_DESC(SYS_##name), \ ID_DESC_DEFAULT_CALLBACKS /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ ID_DESC(name), \ .val = 0, \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ #define AA32_ID_SANITISED(name) { \ ID_DESC(name), \ .visibility = aa32_id_visibility, \ .val = 0, \ } /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ .val = mask, \ } /* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ #define ID_FILTERED(sysreg, name, mask) { \ ID_DESC(sysreg), \ .set_user = set_##name, \ .val = (mask), \ } /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 * (1 <= crm < 8, 0 <= Op2 < 8). */ #define ID_UNALLOCATED(crm, op2) { \ .name = "S3_0_0_" #crm "_" #op2, \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ ID_DESC_DEFAULT_CALLBACKS, \ .visibility = raz_visibility, \ .val = 0, \ } /* * sys_reg_desc initialiser for known ID registers that we hide from guests. * For now, these are exposed just like unallocated ID regs: they appear * RAZ for the guest. */ #define ID_HIDDEN(name) { \ ID_DESC(name), \ .visibility = raz_visibility, \ .val = 0, \ } static bool access_sp_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) __vcpu_sys_reg(vcpu, SP_EL1) = p->regval; else p->regval = __vcpu_sys_reg(vcpu, SP_EL1); return true; } static bool access_elr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1); else p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1); return true; } static bool access_spsr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval; else p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); return true; } static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; else p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); return true; } static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 val = r->val; if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) val |= HCR_E2H; return __vcpu_sys_reg(vcpu, r->reg) = val; } static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, unsigned int (*fn)(const struct kvm_vcpu *, const struct sys_reg_desc *)) { return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); } static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, sve_visibility); } static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (el2_visibility(vcpu, rd) == 0 && kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) return 0; return REG_HIDDEN; } static bool access_zcr_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { unsigned int vq; if (guest_hyp_sve_traps_enabled(vcpu)) { kvm_inject_nested_sve_trap(vcpu); return true; } if (!p->is_write) { p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2); return true; } vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; vq = min(vq, vcpu_sve_max_vq(vcpu)); vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2); return true; } static bool access_gic_vtr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = kvm_vgic_global_state.ich_vtr_el2; p->regval &= ~(ICH_VTR_EL2_DVIM | ICH_VTR_EL2_A3V | ICH_VTR_EL2_IDbits); p->regval |= ICH_VTR_EL2_nV4; return true; } static bool access_gic_misr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = vgic_v3_get_misr(vcpu); return true; } static bool access_gic_eisr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = vgic_v3_get_eisr(vcpu); return true; } static bool access_gic_elrsr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = vgic_v3_get_elrsr(vcpu); return true; } static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_s1poe(vcpu->kvm)) return 0; return REG_HIDDEN; } static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, s1poe_visibility); } static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_tcr2(vcpu->kvm)) return 0; return REG_HIDDEN; } static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, tcr2_visibility); } static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_s1pie(vcpu->kvm)) return 0; return REG_HIDDEN; } static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, s1pie_visibility); } static bool access_mdcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2); if (!p->is_write) { p->regval = old; return true; } val = p->regval; hpmn = FIELD_GET(MDCR_EL2_HPMN, val); /* * If HPMN is out of bounds, limit it to what we actually * support. This matches the UNKNOWN definition of the field * in that case, and keeps the emulation simple. Sort of. */ if (hpmn > vcpu->kvm->arch.nr_pmu_counters) { hpmn = vcpu->kvm->arch.nr_pmu_counters; u64_replace_bits(val, hpmn, MDCR_EL2_HPMN); } __vcpu_sys_reg(vcpu, MDCR_EL2) = val; /* * Request a reload of the PMU to enable/disable the counters * affected by HPME. */ if ((old ^ val) & MDCR_EL2_HPME) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return true; } /* * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them. * The values made visible to userspace were the register values of the boot * CPU. * * At the same time, reads from these registers at EL1 previously were not * trapped, allowing the guest to read the actual hardware value. On big-little * machines, this means the VM can see different values depending on where a * given vCPU got scheduled. * * These registers are now trapped as collateral damage from SME, and what * follows attempts to give a user / guest view consistent with the existing * ABI. */ static bool access_imp_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); /* * Return the VM-scoped implementation ID register values if userspace * has made them writable. */ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags)) return access_id_reg(vcpu, p, r); /* * Otherwise, fall back to the old behavior of returning the value of * the current CPU. */ switch (reg_to_encoding(r)) { case SYS_REVIDR_EL1: p->regval = read_sysreg(revidr_el1); break; case SYS_AIDR_EL1: p->regval = read_sysreg(aidr_el1); break; default: WARN_ON_ONCE(1); } return true; } static u64 __ro_after_init boot_cpu_midr_val; static u64 __ro_after_init boot_cpu_revidr_val; static u64 __ro_after_init boot_cpu_aidr_val; static void init_imp_id_regs(void) { boot_cpu_midr_val = read_sysreg(midr_el1); boot_cpu_revidr_val = read_sysreg(revidr_el1); boot_cpu_aidr_val = read_sysreg(aidr_el1); } static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { switch (reg_to_encoding(r)) { case SYS_MIDR_EL1: return boot_cpu_midr_val; case SYS_REVIDR_EL1: return boot_cpu_revidr_val; case SYS_AIDR_EL1: return boot_cpu_aidr_val; default: KVM_BUG_ON(1, vcpu->kvm); return 0; } } static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { struct kvm *kvm = vcpu->kvm; u64 expected; guard(mutex)(&kvm->arch.config_lock); expected = read_id_reg(vcpu, r); if (expected == val) return 0; if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)) return -EINVAL; /* * Once the VM has started the ID registers are immutable. Reject the * write if userspace tries to change it. */ if (kvm_vm_has_ran_once(kvm)) return -EBUSY; /* * Any value is allowed for the implementation ID registers so long as * it is within the writable mask. */ if ((val & r->val) != val) return -EINVAL; kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val); return 0; } #define IMPLEMENTATION_ID(reg, mask) { \ SYS_DESC(SYS_##reg), \ .access = access_imp_id_reg, \ .get_user = get_id_reg, \ .set_user = set_imp_id_reg, \ .reset = reset_imp_id_reg, \ .val = mask, \ } static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { __vcpu_sys_reg(vcpu, r->reg) = vcpu->kvm->arch.nr_pmu_counters; return vcpu->kvm->arch.nr_pmu_counters; } /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 * * Debug handling: We do trap most, if not all debug related system * registers. The implementation is good enough to ensure that a guest * can use these with minimal performance degradation. The drawback is * that we don't implement any of the external debug architecture. * This should be revisited if we ever encounter a more demanding * guest... */ static const struct sys_reg_desc sys_reg_descs[] = { DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, { SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 }, DBG_BCR_BVR_WCR_WVR_EL1(2), DBG_BCR_BVR_WCR_WVR_EL1(3), DBG_BCR_BVR_WCR_WVR_EL1(4), DBG_BCR_BVR_WCR_WVR_EL1(5), DBG_BCR_BVR_WCR_WVR_EL1(6), DBG_BCR_BVR_WCR_WVR_EL1(7), DBG_BCR_BVR_WCR_WVR_EL1(8), DBG_BCR_BVR_WCR_WVR_EL1(9), DBG_BCR_BVR_WCR_WVR_EL1(10), DBG_BCR_BVR_WCR_WVR_EL1(11), DBG_BCR_BVR_WCR_WVR_EL1(12), DBG_BCR_BVR_WCR_WVR_EL1(13), DBG_BCR_BVR_WCR_WVR_EL1(14), DBG_BCR_BVR_WCR_WVR_EL1(15), { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, { SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 }, { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1, OSLSR_EL1_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, }, { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGCLAIMCLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGAUTHSTATUS_EL1), trap_dbgauthstatus_el1 }, { SYS_DESC(SYS_MDCCSR_EL0), trap_raz_wi }, { SYS_DESC(SYS_DBGDTR_EL0), trap_raz_wi }, // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)), { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)), /* * ID regs: all ID_SANITISED() entries here must have corresponding * entries in arm64_ftr_regs[]. */ /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_dfr0_el1, .visibility = aa32_id_visibility, .reset = read_sanitised_id_dfr0_el1, .val = ID_DFR0_EL1_PerfMon_MASK | ID_DFR0_EL1_CopDbg_MASK, }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), AA32_ID_SANITISED(ID_MMFR2_EL1), AA32_ID_SANITISED(ID_MMFR3_EL1), /* CRm=2 */ AA32_ID_SANITISED(ID_ISAR0_EL1), AA32_ID_SANITISED(ID_ISAR1_EL1), AA32_ID_SANITISED(ID_ISAR2_EL1), AA32_ID_SANITISED(ID_ISAR3_EL1), AA32_ID_SANITISED(ID_ISAR4_EL1), AA32_ID_SANITISED(ID_ISAR5_EL1), AA32_ID_SANITISED(ID_MMFR4_EL1), AA32_ID_SANITISED(ID_ISAR6_EL1), /* CRm=3 */ AA32_ID_SANITISED(MVFR0_EL1), AA32_ID_SANITISED(MVFR1_EL1), AA32_ID_SANITISED(MVFR2_EL1), ID_UNALLOCATED(3,3), AA32_ID_SANITISED(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), AA32_ID_SANITISED(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ /* CRm=4 */ ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, ~(ID_AA64PFR0_EL1_AMU | ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP)), ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | ID_AA64PFR1_EL1_THE | ID_AA64PFR1_EL1_GCS | ID_AA64PFR1_EL1_MTE_frac | ID_AA64PFR1_EL1_NMI | ID_AA64PFR1_EL1_RNDR_trap | ID_AA64PFR1_EL1_SME | ID_AA64PFR1_EL1_RES0 | ID_AA64PFR1_EL1_MPAM_frac | ID_AA64PFR1_EL1_RAS_frac | ID_AA64PFR1_EL1_MTE)), ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ /* * Prior to FEAT_Debugv8.9, the architecture defines context-aware * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). * KVM does not trap + emulate the breakpoint registers, and as such * cannot support a layout that misaligns with the underlying hardware. * While it may be possible to describe a subset that aligns with * hardware, just prevent changes to BRPs and CTX_CMPs altogether for * simplicity. * * See DDI0487K.a, section D2.8.3 Breakpoint types and linking * of breakpoints for more details. */ ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, ID_AA64DFR0_EL1_DoubleLock_MASK | ID_AA64DFR0_EL1_WRPs_MASK | ID_AA64DFR0_EL1_PMUVer_MASK | ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), ID_HIDDEN(ID_AA64AFR0_EL1), ID_HIDDEN(ID_AA64AFR1_EL1), ID_UNALLOCATED(5,6), ID_UNALLOCATED(5,7), /* CRm=6 */ ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0), ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI | ID_AA64ISAR1_EL1_GPA | ID_AA64ISAR1_EL1_API | ID_AA64ISAR1_EL1_APA)), ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 | ID_AA64ISAR2_EL1_APA3 | ID_AA64ISAR2_EL1_GPA3)), ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), ID_UNALLOCATED(6,6), ID_UNALLOCATED(6,7), /* CRm=7 */ ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1, ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | ID_AA64MMFR1_EL1_TWED | ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits)), ID_FILTERED(ID_AA64MMFR2_EL1, id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 | ID_AA64MMFR2_EL1_EVT | ID_AA64MMFR2_EL1_FWB | ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1PIE | ID_AA64MMFR3_EL1_S1POE)), ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, MTE_REG(RGSR_EL1), MTE_REG(GCR_EL1), { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, { SYS_DESC(SYS_TRFCR_EL1), undef_access }, { SYS_DESC(SYS_SMPRI_EL1), undef_access }, { SYS_DESC(SYS_SMCR_EL1), undef_access }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, .visibility = tcr2_visibility }, PTRAUTH_KEY(APIA), PTRAUTH_KEY(APIB), PTRAUTH_KEY(APDA), PTRAUTH_KEY(APDB), PTRAUTH_KEY(APGA), { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi }, MTE_REG(TFSR_EL1), MTE_REG(TFSRE0_EL1), { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, { SYS_DESC(SYS_PMSCR_EL1), undef_access }, { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access }, { SYS_DESC(SYS_PMSICR_EL1), undef_access }, { SYS_DESC(SYS_PMSIRR_EL1), undef_access }, { SYS_DESC(SYS_PMSFCR_EL1), undef_access }, { SYS_DESC(SYS_PMSEVFR_EL1), undef_access }, { SYS_DESC(SYS_PMSLATFR_EL1), undef_access }, { SYS_DESC(SYS_PMSIDR_EL1), undef_access }, { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, { SYS_DESC(SYS_PMBSR_EL1), undef_access }, /* PMBIDR_EL1 is not trapped */ { PMU_SYS_REG(PMINTENSET_EL1), .access = access_pminten, .reg = PMINTENSET_EL1, .get_user = get_pmreg, .set_user = set_pmreg }, { PMU_SYS_REG(PMINTENCLR_EL1), .access = access_pminten, .reg = PMINTENSET_EL1, .get_user = get_pmreg, .set_user = set_pmreg }, { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, .visibility = s1pie_visibility }, { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, .visibility = s1pie_visibility }, { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, { SYS_DESC(SYS_ACCDATA_EL1), undef_access }, { SYS_DESC(SYS_SCXTNUM_EL1), undef_access }, { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)), { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, ID_FILTERED(CTR_EL0, ctr_el0, CTR_EL0_DIC_MASK | CTR_EL0_IDC_MASK | CTR_EL0_DminLine_MASK | CTR_EL0_L1Ip_MASK | CTR_EL0_IminLine_MASK), { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, { PMU_SYS_REG(PMCNTENSET_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, { PMU_SYS_REG(PMCNTENCLR_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, { PMU_SYS_REG(PMOVSCLR_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, /* * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was * previously (and pointlessly) advertised in the past... */ { PMU_SYS_REG(PMSWINC_EL0), .get_user = get_raz_reg, .set_user = set_wi_reg, .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(PMCCNTR_EL0), .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr, .set_user = set_pmu_evcntr }, { PMU_SYS_REG(PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(PMXEVCNTR_EL0), .access = access_pmu_evcntr, .reset = NULL }, /* * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ { PMU_SYS_REG(PMUSERENR_EL0), .access = access_pmuserenr, .reset = reset_val, .reg = PMUSERENR_EL0, .val = 0 }, { PMU_SYS_REG(PMOVSSET_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, { SYS_DESC(SYS_SCXTNUM_EL0), undef_access }, { SYS_DESC(SYS_AMCR_EL0), undef_access }, { SYS_DESC(SYS_AMCFGR_EL0), undef_access }, { SYS_DESC(SYS_AMCGCR_EL0), undef_access }, { SYS_DESC(SYS_AMUSERENR_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENCLR0_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENSET0_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENCLR1_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENSET1_EL0), undef_access }, AMU_AMEVCNTR0_EL0(0), AMU_AMEVCNTR0_EL0(1), AMU_AMEVCNTR0_EL0(2), AMU_AMEVCNTR0_EL0(3), AMU_AMEVCNTR0_EL0(4), AMU_AMEVCNTR0_EL0(5), AMU_AMEVCNTR0_EL0(6), AMU_AMEVCNTR0_EL0(7), AMU_AMEVCNTR0_EL0(8), AMU_AMEVCNTR0_EL0(9), AMU_AMEVCNTR0_EL0(10), AMU_AMEVCNTR0_EL0(11), AMU_AMEVCNTR0_EL0(12), AMU_AMEVCNTR0_EL0(13), AMU_AMEVCNTR0_EL0(14), AMU_AMEVCNTR0_EL0(15), AMU_AMEVTYPER0_EL0(0), AMU_AMEVTYPER0_EL0(1), AMU_AMEVTYPER0_EL0(2), AMU_AMEVTYPER0_EL0(3), AMU_AMEVTYPER0_EL0(4), AMU_AMEVTYPER0_EL0(5), AMU_AMEVTYPER0_EL0(6), AMU_AMEVTYPER0_EL0(7), AMU_AMEVTYPER0_EL0(8), AMU_AMEVTYPER0_EL0(9), AMU_AMEVTYPER0_EL0(10), AMU_AMEVTYPER0_EL0(11), AMU_AMEVTYPER0_EL0(12), AMU_AMEVTYPER0_EL0(13), AMU_AMEVTYPER0_EL0(14), AMU_AMEVTYPER0_EL0(15), AMU_AMEVCNTR1_EL0(0), AMU_AMEVCNTR1_EL0(1), AMU_AMEVCNTR1_EL0(2), AMU_AMEVCNTR1_EL0(3), AMU_AMEVCNTR1_EL0(4), AMU_AMEVCNTR1_EL0(5), AMU_AMEVCNTR1_EL0(6), AMU_AMEVCNTR1_EL0(7), AMU_AMEVCNTR1_EL0(8), AMU_AMEVCNTR1_EL0(9), AMU_AMEVCNTR1_EL0(10), AMU_AMEVCNTR1_EL0(11), AMU_AMEVCNTR1_EL0(12), AMU_AMEVCNTR1_EL0(13), AMU_AMEVCNTR1_EL0(14), AMU_AMEVCNTR1_EL0(15), AMU_AMEVTYPER1_EL0(0), AMU_AMEVTYPER1_EL0(1), AMU_AMEVTYPER1_EL0(2), AMU_AMEVTYPER1_EL0(3), AMU_AMEVTYPER1_EL0(4), AMU_AMEVTYPER1_EL0(5), AMU_AMEVTYPER1_EL0(6), AMU_AMEVTYPER1_EL0(7), AMU_AMEVTYPER1_EL0(8), AMU_AMEVTYPER1_EL0(9), AMU_AMEVTYPER1_EL0(10), AMU_AMEVTYPER1_EL0(11), AMU_AMEVTYPER1_EL0(12), AMU_AMEVTYPER1_EL0(13), AMU_AMEVTYPER1_EL0(14), AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), PMU_PMEVCNTR_EL0(2), PMU_PMEVCNTR_EL0(3), PMU_PMEVCNTR_EL0(4), PMU_PMEVCNTR_EL0(5), PMU_PMEVCNTR_EL0(6), PMU_PMEVCNTR_EL0(7), PMU_PMEVCNTR_EL0(8), PMU_PMEVCNTR_EL0(9), PMU_PMEVCNTR_EL0(10), PMU_PMEVCNTR_EL0(11), PMU_PMEVCNTR_EL0(12), PMU_PMEVCNTR_EL0(13), PMU_PMEVCNTR_EL0(14), PMU_PMEVCNTR_EL0(15), PMU_PMEVCNTR_EL0(16), PMU_PMEVCNTR_EL0(17), PMU_PMEVCNTR_EL0(18), PMU_PMEVCNTR_EL0(19), PMU_PMEVCNTR_EL0(20), PMU_PMEVCNTR_EL0(21), PMU_PMEVCNTR_EL0(22), PMU_PMEVCNTR_EL0(23), PMU_PMEVCNTR_EL0(24), PMU_PMEVCNTR_EL0(25), PMU_PMEVCNTR_EL0(26), PMU_PMEVCNTR_EL0(27), PMU_PMEVCNTR_EL0(28), PMU_PMEVCNTR_EL0(29), PMU_PMEVCNTR_EL0(30), /* PMEVTYPERn_EL0 */ PMU_PMEVTYPER_EL0(0), PMU_PMEVTYPER_EL0(1), PMU_PMEVTYPER_EL0(2), PMU_PMEVTYPER_EL0(3), PMU_PMEVTYPER_EL0(4), PMU_PMEVTYPER_EL0(5), PMU_PMEVTYPER_EL0(6), PMU_PMEVTYPER_EL0(7), PMU_PMEVTYPER_EL0(8), PMU_PMEVTYPER_EL0(9), PMU_PMEVTYPER_EL0(10), PMU_PMEVTYPER_EL0(11), PMU_PMEVTYPER_EL0(12), PMU_PMEVTYPER_EL0(13), PMU_PMEVTYPER_EL0(14), PMU_PMEVTYPER_EL0(15), PMU_PMEVTYPER_EL0(16), PMU_PMEVTYPER_EL0(17), PMU_PMEVTYPER_EL0(18), PMU_PMEVTYPER_EL0(19), PMU_PMEVTYPER_EL0(20), PMU_PMEVTYPER_EL0(21), PMU_PMEVTYPER_EL0(22), PMU_PMEVTYPER_EL0(23), PMU_PMEVTYPER_EL0(24), PMU_PMEVTYPER_EL0(25), PMU_PMEVTYPER_EL0(26), PMU_PMEVTYPER_EL0(27), PMU_PMEVTYPER_EL0(28), PMU_PMEVTYPER_EL0(29), PMU_PMEVTYPER_EL0(30), /* * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper, .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0), EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0), EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), EL2_REG_VNCR(HACR_EL2, reset_val, 0), EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, sve_el2_visibility), EL2_REG_VNCR(HCRX_EL2, reset_val, 0), EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0, vncr_el2_visibility), { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), EL2_REG_REDIR(SPSR_EL2, reset_val, 0), EL2_REG_REDIR(ELR_EL2, reset_val, 0), { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, s1poe_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, { SYS_DESC(SYS_MPAM2_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_RMR_EL2), undef_access }, EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0), EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0), { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre }, EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr }, { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr }, { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr }, { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr }, EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0), EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); __kvm_at_s1e01(vcpu, op, p->regval); return true; } static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); /* There is no FGT associated with AT S1E2A :-( */ if (op == OP_AT_S1E2A && !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { kvm_inject_undefined(vcpu); return false; } __kvm_at_s1e2(vcpu, op, p->regval); return true; } static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); __kvm_at_s12(vcpu, op, p->regval); return true; } static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (sys_reg_CRn(instr) == TLBI_CRn_nXS && !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; return true; } static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); write_lock(&vcpu->kvm->mmu_lock); /* * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the * corresponding VMIDs. */ kvm_nested_s2_unmap(vcpu->kvm, true); write_unlock(&vcpu->kvm->mmu_lock); return true; } static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); u8 Op2 = sys_reg_Op2(instr); if (sys_reg_CRn(instr) == TLBI_CRn_nXS && !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) return false; if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } /* Only defined here as this is an internal "abstraction" */ union tlbi_info { struct { u64 start; u64 size; } range; struct { u64 addr; } ipa; struct { u64 addr; u32 encoding; } va; }; static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { /* * The unmap operation is allowed to drop the MMU lock and block, which * means that @mmu could be used for a different context than the one * currently being invalidated. * * This behavior is still safe, as: * * 1) The vCPU(s) that recycled the MMU are responsible for invalidating * the entire MMU before reusing it, which still honors the intent * of a TLBI. * * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC * and ERET to the guest), other vCPUs are allowed to use stale * translations. * * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and * at worst may cause more aborts for shadow stage-2 fills. * * Dropping the MMU lock also implies that shadow stage-2 fills could * happen behind the back of the TLBI. This is still safe, though, as * the L1 needs to put its stage-2 in a consistent state before doing * the TLBI. */ kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); } static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 limit, vttbr; if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .range = { .start = 0, .size = limit, }, }, s2_mmu_unmap_range); return true; } static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); u64 base, range; if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); /* * Because the shadow S2 structure doesn't necessarily reflect that * of the guest's S2 (different base granule size, for example), we * decide to ignore TTL and only use the described range. */ base = decode_range_tlbi(p->regval, &range, NULL); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .range = { .start = base, .size = range, }, }, s2_mmu_unmap_range); return true; } static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { unsigned long max_size; u64 base_addr; /* * We drop a number of things from the supplied value: * * - NS bit: we're non-secure only. * * - IPA[51:48]: We don't support 52bit IPA just yet... * * And of course, adjust the IPA to be on an actual address. */ base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12; max_size = compute_tlb_inval_range(mmu, info->ipa.addr); base_addr &= ~(max_size - 1); /* * See comment in s2_mmu_unmap_range() for why this is allowed to * reschedule. */ kvm_stage2_unmap_range(mmu, base_addr, max_size, true); } static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .ipa = { .addr = p->regval, }, }, s2_mmu_unmap_ipa); return true; } static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); } static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); return true; } static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); /* * If we're here, this is because we've trapped on a EL1 TLBI * instruction that affects the EL1 translation regime while * we're running in a context that doesn't allow us to let the * HW do its thing (aka vEL2): * * - HCR_EL2.E2H == 0 : a non-VHE guest * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode * * Another possibility is that we are invalidating the EL2 context * using EL1 instructions, but that we landed here because we need * additional invalidation for structures that are not held in the * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 } * as we don't allow an NV-capable L1 in a nVHE configuration. * * We don't expect these helpers to ever be called when running * in a vEL1 context. */ WARN_ON(!vcpu_is_el2(vcpu)); if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) { kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); return true; } kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)), &(union tlbi_info) { .va = { .addr = p->regval, .encoding = sys_encoding, }, }, s2_mmu_tlbi_s1e1); return true; } #define SYS_INSN(insn, access_fn) \ { \ SYS_DESC(OP_##insn), \ .access = (access_fn), \ } static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, SYS_INSN(AT_S1E1R, handle_at_s1e01), SYS_INSN(AT_S1E1W, handle_at_s1e01), SYS_INSN(AT_S1E0R, handle_at_s1e01), SYS_INSN(AT_S1E0W, handle_at_s1e01), SYS_INSN(AT_S1E1RP, handle_at_s1e01), SYS_INSN(AT_S1E1WP, handle_at_s1e01), { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, { SYS_DESC(SYS_DC_CISW), access_dcsw }, { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1), SYS_INSN(TLBI_VAE1, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1, handle_tlbi_el1), SYS_INSN(TLBI_VALE1, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), SYS_INSN(AT_S1E2R, handle_at_s1e2), SYS_INSN(AT_S1E2W, handle_at_s1e2), SYS_INSN(AT_S12E1R, handle_at_s12), SYS_INSN(AT_S12E1W, handle_at_s12), SYS_INSN(AT_S12E0R, handle_at_s12), SYS_INSN(AT_S12E0W, handle_at_s12), SYS_INSN(AT_S1E2A, handle_at_s1e2), SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2), SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2), SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2), SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2), SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2), SYS_INSN(TLBI_RVAE2, handle_tlbi_el2), SYS_INSN(TLBI_RVALE2, handle_tlbi_el2), SYS_INSN(TLBI_ALLE2, handle_tlbi_el2), SYS_INSN(TLBI_VAE2, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1, handle_alle1is), SYS_INSN(TLBI_VALE2, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { return ignore_write(vcpu, p); } else { u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1); u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, dfr) << 20) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, dfr) << 16) | (1 << 15) | (el3 << 14) | (el3 << 12)); return true; } } /* * AArch32 debug register mappings * * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0] * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32] * * None of the other registers share their location, so treat them as * if they were 64bit. */ #define DBG_BCR_BVR_WCR_WVR(n) \ /* DBGBVRn */ \ { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ trap_dbg_wb_reg, NULL, n }, \ /* DBGBCRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ /* DBGWVRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ /* DBGWCRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } #define DBGBXVR(n) \ { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ trap_dbg_wb_reg, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external * debug, on the principle that they don't really make sense to a * guest. Revisit this one day, would this principle change. */ static const struct sys_reg_desc cp14_regs[] = { /* DBGDIDR */ { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgdidr }, /* DBGDTRRXext */ { Op1( 0), CRn( 0), CRm( 0), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(0), /* DBGDSCRint */ { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(1), /* DBGDCCINT */ { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug_regs, NULL, MDCCINT_EL1 }, /* DBGDSCRext */ { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug_regs, NULL, MDSCR_EL1 }, DBG_BCR_BVR_WCR_WVR(2), /* DBGDTR[RT]Xint */ { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi }, /* DBGDTR[RT]Xext */ { Op1( 0), CRn( 0), CRm( 3), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(3), DBG_BCR_BVR_WCR_WVR(4), DBG_BCR_BVR_WCR_WVR(5), /* DBGWFAR */ { Op1( 0), CRn( 0), CRm( 6), Op2( 0), trap_raz_wi }, /* DBGOSECCR */ { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(6), /* DBGVCR */ { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug_regs, NULL, DBGVCR32_EL2 }, DBG_BCR_BVR_WCR_WVR(7), DBG_BCR_BVR_WCR_WVR(8), DBG_BCR_BVR_WCR_WVR(9), DBG_BCR_BVR_WCR_WVR(10), DBG_BCR_BVR_WCR_WVR(11), DBG_BCR_BVR_WCR_WVR(12), DBG_BCR_BVR_WCR_WVR(13), DBG_BCR_BVR_WCR_WVR(14), DBG_BCR_BVR_WCR_WVR(15), /* DBGDRAR (32bit) */ { Op1( 0), CRn( 1), CRm( 0), Op2( 0), trap_raz_wi }, DBGBXVR(0), /* DBGOSLAR */ { Op1( 0), CRn( 1), CRm( 0), Op2( 4), trap_oslar_el1 }, DBGBXVR(1), /* DBGOSLSR */ { Op1( 0), CRn( 1), CRm( 1), Op2( 4), trap_oslsr_el1, NULL, OSLSR_EL1 }, DBGBXVR(2), DBGBXVR(3), /* DBGOSDLR */ { Op1( 0), CRn( 1), CRm( 3), Op2( 4), trap_raz_wi }, DBGBXVR(4), /* DBGPRCR */ { Op1( 0), CRn( 1), CRm( 4), Op2( 4), trap_raz_wi }, DBGBXVR(5), DBGBXVR(6), DBGBXVR(7), DBGBXVR(8), DBGBXVR(9), DBGBXVR(10), DBGBXVR(11), DBGBXVR(12), DBGBXVR(13), DBGBXVR(14), DBGBXVR(15), /* DBGDSAR (32bit) */ { Op1( 0), CRn( 2), CRm( 0), Op2( 0), trap_raz_wi }, /* DBGDEVID2 */ { Op1( 0), CRn( 7), CRm( 0), Op2( 7), trap_raz_wi }, /* DBGDEVID1 */ { Op1( 0), CRn( 7), CRm( 1), Op2( 7), trap_raz_wi }, /* DBGDEVID */ { Op1( 0), CRn( 7), CRm( 2), Op2( 7), trap_raz_wi }, /* DBGCLAIMSET */ { Op1( 0), CRn( 7), CRm( 8), Op2( 6), trap_raz_wi }, /* DBGCLAIMCLR */ { Op1( 0), CRn( 7), CRm( 9), Op2( 6), trap_raz_wi }, /* DBGAUTHSTATUS */ { Op1( 0), CRn( 7), CRm(14), Op2( 6), trap_dbgauthstatus_el1 }, }; /* Trapped cp14 64bit registers */ static const struct sys_reg_desc cp14_64_regs[] = { /* DBGDRAR (64bit) */ { Op1( 0), CRm( 1), .access = trap_raz_wi }, /* DBGDSAR (64bit) */ { Op1( 0), CRm( 2), .access = trap_raz_wi }, }; #define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2) \ AA32(_map), \ Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2), \ .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn register */ #define PMU_PMEVCNTR(n) \ { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ .access = access_pmu_evcntr } /* Macro to expand the PMEVTYPERn register */ #define PMU_PMEVTYPER(n) \ { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ .access = access_pmu_evtyper } /* * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding, * depending on the way they are accessed (as a 32bit or a 64bit * register). */ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, SCTLR_EL1 }, /* ACTLR */ { AA32(LO), Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr, NULL, ACTLR_EL1 }, /* ACTLR2 */ { AA32(HI), Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr, NULL, ACTLR_EL1 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, TTBR1_EL1 }, /* TTBCR */ { AA32(LO), Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, TCR_EL1 }, /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, /* ADFSR */ { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, AFSR0_EL1 }, /* AIFSR */ { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, AFSR1_EL1 }, /* DFAR */ { AA32(LO), Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, FAR_EL1 }, /* IFAR */ { AA32(HI), Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, FAR_EL1 }, /* * DC{C,I,CI}SW operations: */ { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw }, { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw }, { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, /* PMU */ { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr }, { CP15_PMU_SYS_REG(LO, 0, 9, 12, 6), .access = access_pmceid }, { CP15_PMU_SYS_REG(LO, 0, 9, 12, 7), .access = access_pmceid }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs }, { CP15_PMU_SYS_REG(HI, 0, 9, 14, 4), .access = access_pmceid }, { CP15_PMU_SYS_REG(HI, 0, 9, 14, 5), .access = access_pmceid }, /* PMMIR */ { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi }, /* PRRR/MAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, /* NMRR/MAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, MAIR_EL1 }, /* AMAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, AMAIR_EL1 }, /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, /* Arch Tmers */ { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer }, /* PMEVCNTRn */ PMU_PMEVCNTR(0), PMU_PMEVCNTR(1), PMU_PMEVCNTR(2), PMU_PMEVCNTR(3), PMU_PMEVCNTR(4), PMU_PMEVCNTR(5), PMU_PMEVCNTR(6), PMU_PMEVCNTR(7), PMU_PMEVCNTR(8), PMU_PMEVCNTR(9), PMU_PMEVCNTR(10), PMU_PMEVCNTR(11), PMU_PMEVCNTR(12), PMU_PMEVCNTR(13), PMU_PMEVCNTR(14), PMU_PMEVCNTR(15), PMU_PMEVCNTR(16), PMU_PMEVCNTR(17), PMU_PMEVCNTR(18), PMU_PMEVCNTR(19), PMU_PMEVCNTR(20), PMU_PMEVCNTR(21), PMU_PMEVCNTR(22), PMU_PMEVCNTR(23), PMU_PMEVCNTR(24), PMU_PMEVCNTR(25), PMU_PMEVCNTR(26), PMU_PMEVCNTR(27), PMU_PMEVCNTR(28), PMU_PMEVCNTR(29), PMU_PMEVCNTR(30), /* PMEVTYPERn */ PMU_PMEVTYPER(0), PMU_PMEVTYPER(1), PMU_PMEVTYPER(2), PMU_PMEVTYPER(3), PMU_PMEVTYPER(4), PMU_PMEVTYPER(5), PMU_PMEVTYPER(6), PMU_PMEVTYPER(7), PMU_PMEVTYPER(8), PMU_PMEVTYPER(9), PMU_PMEVTYPER(10), PMU_PMEVTYPER(11), PMU_PMEVTYPER(12), PMU_PMEVTYPER(13), PMU_PMEVTYPER(14), PMU_PMEVTYPER(15), PMU_PMEVTYPER(16), PMU_PMEVTYPER(17), PMU_PMEVTYPER(18), PMU_PMEVTYPER(19), PMU_PMEVTYPER(20), PMU_PMEVTYPER(21), PMU_PMEVTYPER(22), PMU_PMEVTYPER(23), PMU_PMEVTYPER(24), PMU_PMEVTYPER(25), PMU_PMEVTYPER(26), PMU_PMEVTYPER(27), PMU_PMEVTYPER(28), PMU_PMEVTYPER(29), PMU_PMEVTYPER(30), /* PMCCFILTR */ { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper }, { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, /* CCSIDR2 */ { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access }, { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 }, }; static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, bool is_32) { unsigned int i; for (i = 0; i < n; i++) { if (!is_32 && table[i].reg && !table[i].reset) { kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n", &table[i], i, table[i].name); return false; } if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n", &table[i], i, table[i - 1].name, table[i].name); return false; } } return true; } int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; } static void perform_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); /* Check for regs disabled by runtime config */ if (sysreg_hidden(vcpu, r)) { kvm_inject_undefined(vcpu); return; } /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies * as a gross bug that should be fixed right away. */ BUG_ON(!r->access); /* Skip instruction if instructed so */ if (likely(r->access(vcpu, params, r))) kvm_incr_pc(vcpu); } /* * emulate_cp -- tries to match a sys_reg access in a handling table, and * call the corresponding trap handler. * * @params: pointer to the descriptor of the access * @table: array of trap descriptors * @num: size of the trap descriptor array * * Return true if the access has been handled, false if not. */ static bool emulate_cp(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *table, size_t num) { const struct sys_reg_desc *r; if (!table) return false; /* Not handled */ r = find_reg(params, table, num); if (r) { perform_access(vcpu, params, r); return true; } /* Not handled */ return false; } static void unhandled_cp_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); int cp = -1; switch (esr_ec) { case ESR_ELx_EC_CP15_32: case ESR_ELx_EC_CP15_64: cp = 15; break; case ESR_ELx_EC_CP14_MR: case ESR_ELx_EC_CP14_64: cp = 14; break; default: WARN_ON(1); } print_sys_reg_msg(params, "Unsupported guest CP%d access at: %08lx [%08lx]\n", cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); } /** * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer * @global: &struct sys_reg_desc * @nr_global: size of the @global array */ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, const struct sys_reg_desc *global, size_t nr_global) { struct sys_reg_params params; u64 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); int Rt2 = (esr >> 10) & 0x1f; params.CRm = (esr >> 1) & 0xf; params.is_write = ((esr & 1) == 0); params.Op0 = 0; params.Op1 = (esr >> 16) & 0xf; params.Op2 = 0; params.CRn = 0; /* * Make a 64-bit value out of Rt and Rt2. As we use the same trap * backends between AArch32 and AArch64, we get away with it. */ if (params.is_write) { params.regval = vcpu_get_reg(vcpu, Rt) & 0xffffffff; params.regval |= vcpu_get_reg(vcpu, Rt2) << 32; } /* * If the table contains a handler, handle the * potential register operation in the case of a read and return * with success. */ if (emulate_cp(vcpu, ¶ms, global, nr_global)) { /* Split up the value between registers for the read side */ if (!params.is_write) { vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); } return 1; } unhandled_cp_access(vcpu, ¶ms); return 1; } static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params); /* * The CP10 ID registers are architecturally mapped to AArch64 feature * registers. Abuse that fact so we can rely on the AArch64 handler for accesses * from AArch32. */ static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params) { u8 reg_id = (esr >> 10) & 0xf; bool valid; params->is_write = ((esr & 1) == 0); params->Op0 = 3; params->Op1 = 0; params->CRn = 0; params->CRm = 3; /* CP10 ID registers are read-only */ valid = !params->is_write; switch (reg_id) { /* MVFR0 */ case 0b0111: params->Op2 = 0; break; /* MVFR1 */ case 0b0110: params->Op2 = 1; break; /* MVFR2 */ case 0b0101: params->Op2 = 2; break; default: valid = false; } if (valid) return true; kvm_pr_unimpl("Unhandled cp10 register %s: %u\n", params->is_write ? "write" : "read", reg_id); return false; } /** * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and * VFP Register' from AArch32. * @vcpu: The vCPU pointer * * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers. * Work out the correct AArch64 system register encoding and reroute to the * AArch64 system register emulation. */ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu) { int Rt = kvm_vcpu_sys_get_rt(vcpu); u64 esr = kvm_vcpu_get_esr(vcpu); struct sys_reg_params params; /* UNDEF on any unhandled register access */ if (!kvm_esr_cp10_id_to_sys64(esr, ¶ms)) { kvm_inject_undefined(vcpu); return 1; } if (emulate_sys_reg(vcpu, ¶ms)) vcpu_set_reg(vcpu, Rt, params.regval); return 1; } /** * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where * CRn=0, which corresponds to the AArch32 feature * registers. * @vcpu: the vCPU pointer * @params: the system register access parameters. * * Our cp15 system register tables do not enumerate the AArch32 feature * registers. Conveniently, our AArch64 table does, and the AArch32 system * register encoding can be trivially remapped into the AArch64 for the feature * registers: Append op0=3, leaving op1, CRn, CRm, and op2 the same. * * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit * System registers with (coproc=0b1111, CRn==c0)", read accesses from this * range are either UNKNOWN or RES0. Rerouting remains architectural as we * treat undefined registers in this range as RAZ. */ static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { int Rt = kvm_vcpu_sys_get_rt(vcpu); /* Treat impossible writes to RO registers as UNDEFINED */ if (params->is_write) { unhandled_cp_access(vcpu, params); return 1; } params->Op0 = 3; /* * All registers where CRm > 3 are known to be UNKNOWN/RAZ from AArch32. * Avoid conflicting with future expansion of AArch64 feature registers * and simply treat them as RAZ here. */ if (params->CRm > 3) params->regval = 0; else if (!emulate_sys_reg(vcpu, params)) return 1; vcpu_set_reg(vcpu, Rt, params->regval); return 1; } /** * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer * @params: &struct sys_reg_params * @global: &struct sys_reg_desc * @nr_global: size of the @global array */ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *global, size_t nr_global) { int Rt = kvm_vcpu_sys_get_rt(vcpu); params->regval = vcpu_get_reg(vcpu, Rt); if (emulate_cp(vcpu, params, global, nr_global)) { if (!params->is_write) vcpu_set_reg(vcpu, Rt, params->regval); return 1; } unhandled_cp_access(vcpu, params); return 1; } int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) { return kvm_handle_cp_64(vcpu, cp15_64_regs, ARRAY_SIZE(cp15_64_regs)); } int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) { struct sys_reg_params params; params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); /* * Certain AArch32 ID registers are handled by rerouting to the AArch64 * system register table. Registers in the ID range where CRm=0 are * excluded from this scheme as they do not trivially map into AArch64 * system register encodings, except for AIDR/REVIDR. */ if (params.Op1 == 0 && params.CRn == 0 && (params.CRm || params.Op2 == 6 /* REVIDR */)) return kvm_emulate_cp15_id_reg(vcpu, ¶ms); if (params.Op1 == 1 && params.CRn == 0 && params.CRm == 0 && params.Op2 == 7 /* AIDR */) return kvm_emulate_cp15_id_reg(vcpu, ¶ms); return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); } int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) { return kvm_handle_cp_64(vcpu, cp14_64_regs, ARRAY_SIZE(cp14_64_regs)); } int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) { struct sys_reg_params params; params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); return kvm_handle_cp_32(vcpu, ¶ms, cp14_regs, ARRAY_SIZE(cp14_regs)); } /** * emulate_sys_reg - Emulate a guest access to an AArch64 system register * @vcpu: The VCPU pointer * @params: Decoded system register parameters * * Return: true if the system register access was successful, false otherwise. */ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { const struct sys_reg_desc *r; r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); if (likely(r)) { perform_access(vcpu, params, r); return true; } print_sys_reg_msg(params, "Unsupported guest sys_reg access at: %lx [%08lx]\n", *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); return false; } static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) { unsigned long i, idreg_idx = 0; for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *r = &sys_reg_descs[i]; if (!is_vm_ftr_id_reg(reg_to_encoding(r))) continue; if (idreg_idx == pos) return r; idreg_idx++; } return NULL; } static void *idregs_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; u8 *iter; mutex_lock(&kvm->arch.config_lock); iter = &kvm->arch.idreg_debugfs_iter; if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && *iter == (u8)~0) { *iter = *pos; if (!idregs_debug_find(kvm, *iter)) iter = NULL; } else { iter = ERR_PTR(-EBUSY); } mutex_unlock(&kvm->arch.config_lock); return iter; } static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct kvm *kvm = s->private; (*pos)++; if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { kvm->arch.idreg_debugfs_iter++; return &kvm->arch.idreg_debugfs_iter; } return NULL; } static void idregs_debug_stop(struct seq_file *s, void *v) { struct kvm *kvm = s->private; if (IS_ERR(v)) return; mutex_lock(&kvm->arch.config_lock); kvm->arch.idreg_debugfs_iter = ~0; mutex_unlock(&kvm->arch.config_lock); } static int idregs_debug_show(struct seq_file *s, void *v) { const struct sys_reg_desc *desc; struct kvm *kvm = s->private; desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); if (!desc->name) return 0; seq_printf(s, "%20s:\t%016llx\n", desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc))); return 0; } static const struct seq_operations idregs_debug_sops = { .start = idregs_debug_start, .next = idregs_debug_next, .stop = idregs_debug_stop, .show = idregs_debug_show, }; DEFINE_SEQ_ATTRIBUTE(idregs_debug); void kvm_sys_regs_create_debugfs(struct kvm *kvm) { kvm->arch.idreg_debugfs_iter = ~0; debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, &idregs_debug_fops); } static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) { u32 id = reg_to_encoding(reg); struct kvm *kvm = vcpu->kvm; if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) return; kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg)); } static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) { if (kvm_vcpu_initialized(vcpu)) return; reg->reset(vcpu, reg); } /** * kvm_reset_sys_regs - sets system registers to reset value * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on the * virtual CPU struct to their architecturally defined reset values. */ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; unsigned long i; for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *r = &sys_reg_descs[i]; if (!r->reset) continue; if (is_vm_ftr_id_reg(reg_to_encoding(r))) reset_vm_ftr_id_reg(vcpu, r); else if (is_vcpu_ftr_id_reg(reg_to_encoding(r))) reset_vcpu_ftr_id_reg(vcpu, r); else r->reset(vcpu, r); if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) (void)__vcpu_sys_reg(vcpu, r->reg); } set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); if (kvm_vcpu_has_pmu(vcpu)) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } /** * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction * trap on a guest execution * @vcpu: The VCPU pointer */ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { const struct sys_reg_desc *desc = NULL; struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); int sr_idx; trace_kvm_handle_sys_reg(esr); if (triage_sysreg_trap(vcpu, &sr_idx)) return 1; params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */ if (params.Op0 == 2 || params.Op0 == 3) desc = &sys_reg_descs[sr_idx]; else desc = &sys_insn_descs[sr_idx]; perform_access(vcpu, ¶ms, desc); /* Read from system register? */ if (!params.is_write && (params.Op0 == 2 || params.Op0 == 3)) vcpu_set_reg(vcpu, Rt, params.regval); return 1; } /****************************************************************************** * Userspace API *****************************************************************************/ static bool index_to_params(u64 id, struct sys_reg_params *params) { switch (id & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U64: /* Any unused index bits means it's not valid. */ if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK | KVM_REG_ARM64_SYSREG_OP0_MASK | KVM_REG_ARM64_SYSREG_OP1_MASK | KVM_REG_ARM64_SYSREG_CRN_MASK | KVM_REG_ARM64_SYSREG_CRM_MASK | KVM_REG_ARM64_SYSREG_OP2_MASK)) return false; params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT); params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT); params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT); params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT); params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT); return true; default: return false; } } const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num) { struct sys_reg_params params; if (!index_to_params(id, ¶ms)) return NULL; return find_reg(¶ms, table, num); } /* Decode an index value, and find the sys_reg_desc entry. */ static const struct sys_reg_desc * id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, const struct sys_reg_desc table[], unsigned int num) { const struct sys_reg_desc *r; /* We only do sys_reg for now. */ if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; r = get_reg_by_id(id, table, num); /* Not saved in the sys_reg array and not otherwise accessible? */ if (r && (!(r->reg || r->get_user) || sysreg_hidden(vcpu, r))) r = NULL; return r; } static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; u32 __user *uval = uaddr; /* Fail if we have unknown bits set. */ if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) return -ENOENT; switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { case KVM_REG_ARM_DEMUX_ID_CCSIDR: if (KVM_REG_SIZE(id) != 4) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; if (val >= CSSELR_MAX) return -ENOENT; return put_user(get_ccsidr(vcpu, val), uval); default: return -ENOENT; } } static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val, newval; u32 __user *uval = uaddr; /* Fail if we have unknown bits set. */ if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) return -ENOENT; switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { case KVM_REG_ARM_DEMUX_ID_CCSIDR: if (KVM_REG_SIZE(id) != 4) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; if (val >= CSSELR_MAX) return -ENOENT; if (get_user(newval, uval)) return -EFAULT; return set_ccsidr(vcpu, val, newval); default: return -ENOENT; } } int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; u64 val; int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { ret = (r->get_user)(vcpu, r, &val); } else { val = __vcpu_sys_reg(vcpu, r->reg); ret = 0; } if (!ret) ret = put_user(val, uaddr); return ret; } int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(vcpu, reg->id, uaddr); return kvm_sys_reg_get_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; u64 val; int ret; if (get_user(val, uaddr)) return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) return 0; if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { __vcpu_sys_reg(vcpu, r->reg) = val; ret = 0; } return ret; } int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(vcpu, reg->id, uaddr); return kvm_sys_reg_set_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } static unsigned int num_demux_regs(void) { return CSSELR_MAX; } static int write_demux_regids(u64 __user *uindices) { u64 val = KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; unsigned int i; val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; for (i = 0; i < CSSELR_MAX; i++) { if (put_user(val | i, uindices)) return -EFAULT; uindices++; } return 0; } static u64 sys_reg_to_index(const struct sys_reg_desc *reg) { return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | (reg->Op0 << KVM_REG_ARM64_SYSREG_OP0_SHIFT) | (reg->Op1 << KVM_REG_ARM64_SYSREG_OP1_SHIFT) | (reg->CRn << KVM_REG_ARM64_SYSREG_CRN_SHIFT) | (reg->CRm << KVM_REG_ARM64_SYSREG_CRM_SHIFT) | (reg->Op2 << KVM_REG_ARM64_SYSREG_OP2_SHIFT)); } static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) { if (!*uind) return true; if (put_user(sys_reg_to_index(reg), *uind)) return false; (*uind)++; return true; } static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { /* * Ignore registers we trap but don't save, * and for which no custom user accessor is provided. */ if (!(rd->reg || rd->get_user)) return 0; if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) return -EFAULT; (*total)++; return 0; } /* Assumed ordered tables, see kvm_sys_reg_table_init. */ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) { const struct sys_reg_desc *i2, *end2; unsigned int total = 0; int err; i2 = sys_reg_descs; end2 = sys_reg_descs + ARRAY_SIZE(sys_reg_descs); while (i2 != end2) { err = walk_one_sys_reg(vcpu, i2++, &uind, &total); if (err) return err; } return total; } unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { return num_demux_regs() + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { int err; err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; uindices += err; return write_demux_regids(uindices); } #define KVM_ARM_FEATURE_ID_RANGE_INDEX(r) \ KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(r), \ sys_reg_Op1(r), \ sys_reg_CRn(r), \ sys_reg_CRm(r), \ sys_reg_Op2(r)) int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range) { const void *zero_page = page_to_virt(ZERO_PAGE(0)); u64 __user *masks = (u64 __user *)range->addr; /* Only feature id range is supported, reserved[13] must be zero. */ if (range->range || memcmp(range->reserved, zero_page, sizeof(range->reserved))) return -EINVAL; /* Wipe the whole thing first */ if (clear_user(masks, KVM_ARM_FEATURE_ID_RANGE_SIZE * sizeof(__u64))) return -EFAULT; for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *reg = &sys_reg_descs[i]; u32 encoding = reg_to_encoding(reg); u64 val; if (!is_feature_id_reg(encoding) || !reg->set_user) continue; if (!reg->val || (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) { continue; } val = reg->val; if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding)))) return -EFAULT; } return 0; } static void vcpu_set_hcr(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (has_vhe() || has_hvhe()) vcpu->arch.hcr_el2 |= HCR_E2H; if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { /* route synchronous external abort exceptions to EL2 */ vcpu->arch.hcr_el2 |= HCR_TEA; /* trap error record accesses */ vcpu->arch.hcr_el2 |= HCR_TERR; } if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) vcpu->arch.hcr_el2 |= HCR_FWB; if (cpus_have_final_cap(ARM64_HAS_EVT) && !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0)) vcpu->arch.hcr_el2 |= HCR_TID4; else vcpu->arch.hcr_el2 |= HCR_TID2; if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; if (kvm_has_mte(vcpu->kvm)) vcpu->arch.hcr_el2 |= HCR_ATA; /* * In the absence of FGT, we cannot independently trap TLBI * Range instructions. This isn't great, but trapping all * TLBIs would be far worse. Live with it... */ if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) vcpu->arch.hcr_el2 |= HCR_TTLBOS; } void kvm_calculate_traps(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; mutex_lock(&kvm->arch.config_lock); vcpu_set_hcr(vcpu); vcpu_set_ich_hcr(vcpu); vcpu_set_hcrx(vcpu); if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) goto out; compute_fgu(kvm, HFGRTR_GROUP); compute_fgu(kvm, HFGITR_GROUP); compute_fgu(kvm, HDFGRTR_GROUP); compute_fgu(kvm, HAFGRTR_GROUP); compute_fgu(kvm, HFGRTR2_GROUP); compute_fgu(kvm, HFGITR2_GROUP); compute_fgu(kvm, HDFGRTR2_GROUP); set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: mutex_unlock(&kvm->arch.config_lock); } /* * Perform last adjustments to the ID registers that are implied by the * configuration outside of the ID regs themselves, as well as any * initialisation that directly depend on these ID registers (such as * RES0/RES1 behaviours). This is not the place to configure traps though. * * Because this can be called once per CPU, changes must be idempotent. */ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; guard(mutex)(&kvm->arch.config_lock); if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && irqchip_in_kernel(kvm) && kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; } if (vcpu_has_nv(vcpu)) { int ret = kvm_init_nv_sysregs(vcpu); if (ret) return ret; } return 0; } int __init kvm_sys_reg_table_init(void) { bool valid = true; unsigned int i; int ret = 0; /* Make sure tables are unique and in order. */ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true); valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); if (!valid) return -EINVAL; init_imp_id_regs(); ret = populate_nv_trap_config(); check_feature_map(); for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) ret = populate_sysreg_config(sys_reg_descs + i, i); for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++) ret = populate_sysreg_config(sys_insn_descs + i, i); return ret; } |
166 166 162 3 3 163 165 166 165 166 166 2 2 2 2 1 162 127 124 1 1 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | // SPDX-License-Identifier: GPL-2.0-only /* * Debug and Guest Debug support * * Copyright (C) 2015 - Linaro Ltd * Authors: Alex Bennée <alex.bennee@linaro.org> * Oliver Upton <oliver.upton@linux.dev> */ #include <linux/kvm_host.h> #include <linux/hw_breakpoint.h> #include <asm/debug-monitors.h> #include <asm/kvm_asm.h> #include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> /** * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value * * @vcpu: the vcpu pointer * * This ensures we will trap access to: * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) * - Debug ROM Address (MDCR_EL2_TDRA) * - OS related registers (MDCR_EL2_TDOSA) * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) * - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB) */ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) { preempt_disable(); /* * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers */ vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, *host_data_ptr(nr_event_counters)); vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TTRF | MDCR_EL2_TPMCR | MDCR_EL2_TDRA | MDCR_EL2_TDOSA); /* Is the VM being debugged by userspace? */ if (vcpu->guest_debug) /* Route all software debug exceptions to EL2 */ vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; /* * Trap debug registers if the guest doesn't have ownership of them. */ if (!kvm_guest_owns_debug_regs(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; /* Write MDCR_EL2 directly if we're already at EL2 */ if (has_vhe()) write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); preempt_enable(); } void kvm_init_host_debug_data(void) { u64 dfr0 = read_sysreg(id_aa64dfr0_el1); if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); if (has_vhe()) return; if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P)) host_data_set_flag(HAS_SPE); if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) { /* Force disable trace in protected mode in case of no TRBE */ if (is_protected_kvm_enabled()) host_data_set_flag(EL1_TRACING_CONFIGURED); if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) host_data_set_flag(HAS_TRBE); } } /* * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host * has taken over MDSCR_EL1. * * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1. * * - Userspace is using the breakpoint/watchpoint registers to debug the * guest, and MDSCR_EL1.MDE is forced to 1. * * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0, * masking all debug exceptions affected by the OS Lock. */ static void setup_external_mdscr(struct kvm_vcpu *vcpu) { /* * Use the guest's MDSCR_EL1 as a starting point, since there are * several other features controlled by MDSCR_EL1 that are not relevant * to the host. * * Clear the bits that KVM may use which also satisfies emulation of * the OS Lock as MDSCR_EL1.MDE is cleared. */ u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS | MDSCR_EL1_MDE | MDSCR_EL1_KDE); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) mdscr |= MDSCR_EL1_SS; if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE; vcpu->arch.external_mdscr_el1 = mdscr; } void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) { u64 mdscr; /* Must be called before kvm_vcpu_load_vhe() */ KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); /* * Determine which of the possible debug states we're in: * * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do * software step or emulate the effects of the OS Lock being enabled. * * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and * the breakpoint/watchpoint registers need to be loaded eagerly. * * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint * context needs to be loaded on the CPU. */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; setup_external_mdscr(vcpu); /* * Steal the guest's single-step state machine if userspace wants * single-step the guest. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING); else vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING); if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING)) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; } } else { mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; else vcpu->arch.debug_owner = VCPU_DEBUG_FREE; } kvm_arm_setup_mdcr_el2(vcpu); } void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) return; /* * Save the host's software step state and restore the guest's before * potentially returning to userspace. */ if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING); else vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING)) *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; else *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; } /* * Updates ownership of the debug registers after a trapped guest access to a * breakpoint/watchpoint register. Host ownership of the debug registers is of * strictly higher priority, and it is the responsibility of the VMM to emulate * guest debug exceptions in this configuration. */ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) { if (kvm_host_owns_debug_regs(vcpu)) return; vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; kvm_arm_setup_mdcr_el2(vcpu); } void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) { if (val & OSLAR_EL1_OSLK) __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK; else __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK; preempt_disable(); kvm_arch_vcpu_put(vcpu); kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); } void kvm_enable_trbe(void) { if (has_vhe() || is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) return; host_data_set_flag(TRBE_ENABLED); } EXPORT_SYMBOL_GPL(kvm_enable_trbe); void kvm_disable_trbe(void) { if (has_vhe() || is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) return; host_data_clear_flag(TRBE_ENABLED); } EXPORT_SYMBOL_GPL(kvm_disable_trbe); void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) { if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) return; if (has_vhe()) { write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12); return; } *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest; if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest) host_data_set_flag(EL1_TRACING_CONFIGURED); else host_data_clear_flag(EL1_TRACING_CONFIGURED); } EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration); |
7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Authors: Karl MacMillan <kmacmillan@tresys.com> * Frank Mayer <mayerf@tresys.com> * Copyright (C) 2003 - 2004 Tresys Technology, LLC */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/slab.h> #include "security.h" #include "conditional.h" #include "services.h" /* * cond_evaluate_expr evaluates a conditional expr * in reverse polish notation. It returns true (1), false (0), * or undefined (-1). Undefined occurs when the expression * exceeds the stack depth of COND_EXPR_MAXDEPTH. */ static int cond_evaluate_expr(struct policydb *p, struct cond_expr *expr) { u32 i; int s[COND_EXPR_MAXDEPTH]; int sp = -1; if (expr->len == 0) return -1; for (i = 0; i < expr->len; i++) { struct cond_expr_node *node = &expr->nodes[i]; switch (node->expr_type) { case COND_BOOL: if (sp == (COND_EXPR_MAXDEPTH - 1)) return -1; sp++; s[sp] = p->bool_val_to_struct[node->boolean - 1]->state; break; case COND_NOT: if (sp < 0) return -1; s[sp] = !s[sp]; break; case COND_OR: if (sp < 1) return -1; sp--; s[sp] |= s[sp + 1]; break; case COND_AND: if (sp < 1) return -1; sp--; s[sp] &= s[sp + 1]; break; case COND_XOR: if (sp < 1) return -1; sp--; s[sp] ^= s[sp + 1]; break; case COND_EQ: if (sp < 1) return -1; sp--; s[sp] = (s[sp] == s[sp + 1]); break; case COND_NEQ: if (sp < 1) return -1; sp--; s[sp] = (s[sp] != s[sp + 1]); break; default: return -1; } } return s[0]; } /* * evaluate_cond_node evaluates the conditional stored in * a struct cond_node and if the result is different than the * current state of the node it sets the rules in the true/false * list appropriately. If the result of the expression is undefined * all of the rules are disabled for safety. */ static void evaluate_cond_node(struct policydb *p, struct cond_node *node) { struct avtab_node *avnode; int new_state; u32 i; new_state = cond_evaluate_expr(p, &node->expr); if (new_state != node->cur_state) { node->cur_state = new_state; if (new_state == -1) pr_err("SELinux: expression result was undefined - disabling all rules.\n"); /* turn the rules on or off */ for (i = 0; i < node->true_list.len; i++) { avnode = node->true_list.nodes[i]; if (new_state <= 0) avnode->key.specified &= ~AVTAB_ENABLED; else avnode->key.specified |= AVTAB_ENABLED; } for (i = 0; i < node->false_list.len; i++) { avnode = node->false_list.nodes[i]; /* -1 or 1 */ if (new_state) avnode->key.specified &= ~AVTAB_ENABLED; else avnode->key.specified |= AVTAB_ENABLED; } } } void evaluate_cond_nodes(struct policydb *p) { u32 i; for (i = 0; i < p->cond_list_len; i++) evaluate_cond_node(p, &p->cond_list[i]); } void cond_policydb_init(struct policydb *p) { p->bool_val_to_struct = NULL; p->cond_list = NULL; p->cond_list_len = 0; avtab_init(&p->te_cond_avtab); } static void cond_node_destroy(struct cond_node *node) { kfree(node->expr.nodes); /* the avtab_ptr_t nodes are destroyed by the avtab */ kfree(node->true_list.nodes); kfree(node->false_list.nodes); } static void cond_list_destroy(struct policydb *p) { u32 i; for (i = 0; i < p->cond_list_len; i++) cond_node_destroy(&p->cond_list[i]); kfree(p->cond_list); p->cond_list = NULL; p->cond_list_len = 0; } void cond_policydb_destroy(struct policydb *p) { kfree(p->bool_val_to_struct); avtab_destroy(&p->te_cond_avtab); cond_list_destroy(p); } int cond_init_bool_indexes(struct policydb *p) { kfree(p->bool_val_to_struct); p->bool_val_to_struct = kmalloc_array( p->p_bools.nprim, sizeof(*p->bool_val_to_struct), GFP_KERNEL); if (!p->bool_val_to_struct) return -ENOMEM; avtab_hash_eval(&p->te_cond_avtab, "conditional_rules"); return 0; } int cond_destroy_bool(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } int cond_index_bool(void *key, void *datum, void *datap) { struct policydb *p; struct cond_bool_datum *booldatum; booldatum = datum; p = datap; if (!booldatum->value || booldatum->value > p->p_bools.nprim) return -EINVAL; p->sym_val_to_name[SYM_BOOLS][booldatum->value - 1] = key; p->bool_val_to_struct[booldatum->value - 1] = booldatum; return 0; } static int bool_isvalid(struct cond_bool_datum *b) { if (!(b->state == 0 || b->state == 1)) return 0; return 1; } int cond_read_bool(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct cond_bool_datum *booldatum; __le32 buf[3]; u32 len; int rc; booldatum = kzalloc(sizeof(*booldatum), GFP_KERNEL); if (!booldatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof(buf)); if (rc) goto err; booldatum->value = le32_to_cpu(buf[0]); booldatum->state = le32_to_cpu(buf[1]); rc = -EINVAL; if (!bool_isvalid(booldatum)) goto err; len = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto err; rc = symtab_insert(s, key, booldatum); if (rc) goto err; return 0; err: cond_destroy_bool(key, booldatum, NULL); return rc; } struct cond_insertf_data { struct policydb *p; struct avtab_node **dst; struct cond_av_list *other; }; static int cond_insertf(struct avtab *a, const struct avtab_key *k, const struct avtab_datum *d, void *ptr) { struct cond_insertf_data *data = ptr; struct policydb *p = data->p; struct cond_av_list *other = data->other; struct avtab_node *node_ptr; u32 i; bool found; /* * For type rules we have to make certain there aren't any * conflicting rules by searching the te_avtab and the * cond_te_avtab. */ if (k->specified & AVTAB_TYPE) { if (avtab_search_node(&p->te_avtab, k)) { pr_err("SELinux: type rule already exists outside of a conditional.\n"); return -EINVAL; } /* * If we are reading the false list other will be a pointer to * the true list. We can have duplicate entries if there is only * 1 other entry and it is in our true list. * * If we are reading the true list (other == NULL) there shouldn't * be any other entries. */ if (other) { node_ptr = avtab_search_node(&p->te_cond_avtab, k); if (node_ptr) { if (avtab_search_node_next(node_ptr, k->specified)) { pr_err("SELinux: too many conflicting type rules.\n"); return -EINVAL; } found = false; for (i = 0; i < other->len; i++) { if (other->nodes[i] == node_ptr) { found = true; break; } } if (!found) { pr_err("SELinux: conflicting type rules.\n"); return -EINVAL; } } } else { if (avtab_search_node(&p->te_cond_avtab, k)) { pr_err("SELinux: conflicting type rules when adding type rule for true.\n"); return -EINVAL; } } } node_ptr = avtab_insert_nonunique(&p->te_cond_avtab, k, d); if (!node_ptr) { pr_err("SELinux: could not insert rule.\n"); return -ENOMEM; } *data->dst = node_ptr; return 0; } static int cond_read_av_list(struct policydb *p, struct policy_file *fp, struct cond_av_list *list, struct cond_av_list *other) { int rc; __le32 buf[1]; u32 i, len; struct cond_insertf_data data; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; len = le32_to_cpu(buf[0]); if (len == 0) return 0; list->nodes = kcalloc(len, sizeof(*list->nodes), GFP_KERNEL); if (!list->nodes) return -ENOMEM; data.p = p; data.other = other; for (i = 0; i < len; i++) { data.dst = &list->nodes[i]; rc = avtab_read_item(&p->te_cond_avtab, fp, p, cond_insertf, &data, true); if (rc) { kfree(list->nodes); list->nodes = NULL; return rc; } } list->len = len; return 0; } static int expr_node_isvalid(struct policydb *p, struct cond_expr_node *expr) { if (expr->expr_type <= 0 || expr->expr_type > COND_LAST) { pr_err("SELinux: conditional expressions uses unknown operator.\n"); return 0; } if (expr->boolean > p->p_bools.nprim) { pr_err("SELinux: conditional expressions uses unknown bool.\n"); return 0; } return 1; } static int cond_read_node(struct policydb *p, struct cond_node *node, struct policy_file *fp) { __le32 buf[2]; u32 i, len; int rc; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) return rc; node->cur_state = le32_to_cpu(buf[0]); /* expr */ len = le32_to_cpu(buf[1]); node->expr.nodes = kcalloc(len, sizeof(*node->expr.nodes), GFP_KERNEL); if (!node->expr.nodes) return -ENOMEM; node->expr.len = len; for (i = 0; i < len; i++) { struct cond_expr_node *expr = &node->expr.nodes[i]; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) return rc; expr->expr_type = le32_to_cpu(buf[0]); expr->boolean = le32_to_cpu(buf[1]); if (!expr_node_isvalid(p, expr)) return -EINVAL; } rc = cond_read_av_list(p, fp, &node->true_list, NULL); if (rc) return rc; return cond_read_av_list(p, fp, &node->false_list, &node->true_list); } int cond_read_list(struct policydb *p, struct policy_file *fp) { __le32 buf[1]; u32 i, len; int rc; rc = next_entry(buf, fp, sizeof(buf)); if (rc) return rc; len = le32_to_cpu(buf[0]); p->cond_list = kcalloc(len, sizeof(*p->cond_list), GFP_KERNEL); if (!p->cond_list) return -ENOMEM; rc = avtab_alloc(&(p->te_cond_avtab), p->te_avtab.nel); if (rc) goto err; p->cond_list_len = len; for (i = 0; i < len; i++) { rc = cond_read_node(p, &p->cond_list[i], fp); if (rc) goto err; } return 0; err: cond_list_destroy(p); return rc; } int cond_write_bool(void *vkey, void *datum, void *ptr) { char *key = vkey; struct cond_bool_datum *booldatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[3]; u32 len; int rc; len = strlen(key); buf[0] = cpu_to_le32(booldatum->value); buf[1] = cpu_to_le32(booldatum->state); buf[2] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } /* * cond_write_cond_av_list doesn't write out the av_list nodes. * Instead it writes out the key/value pairs from the avtab. This * is necessary because there is no way to uniquely identifying rules * in the avtab so it is not possible to associate individual rules * in the avtab with a conditional without saving them as part of * the conditional. This means that the avtab with the conditional * rules will not be saved but will be rebuilt on policy load. */ static int cond_write_av_list(struct policydb *p, struct cond_av_list *list, struct policy_file *fp) { __le32 buf[1]; u32 i; int rc; buf[0] = cpu_to_le32(list->len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < list->len; i++) { rc = avtab_write_item(p, list->nodes[i], fp); if (rc) return rc; } return 0; } static int cond_write_node(struct policydb *p, struct cond_node *node, struct policy_file *fp) { __le32 buf[2]; int rc; u32 i; buf[0] = cpu_to_le32(node->cur_state); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; buf[0] = cpu_to_le32(node->expr.len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < node->expr.len; i++) { buf[0] = cpu_to_le32(node->expr.nodes[i].expr_type); buf[1] = cpu_to_le32(node->expr.nodes[i].boolean); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; } rc = cond_write_av_list(p, &node->true_list, fp); if (rc) return rc; rc = cond_write_av_list(p, &node->false_list, fp); if (rc) return rc; return 0; } int cond_write_list(struct policydb *p, struct policy_file *fp) { u32 i; __le32 buf[1]; int rc; buf[0] = cpu_to_le32(p->cond_list_len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < p->cond_list_len; i++) { rc = cond_write_node(p, &p->cond_list[i], fp); if (rc) return rc; } return 0; } void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, struct extended_perms_decision *xpermd) { struct avtab_node *node; if (!ctab || !key || !xpermd) return; for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if (node->key.specified & AVTAB_ENABLED) services_compute_xperms_decision(xpermd, node); } } /* Determine whether additional permissions are granted by the conditional * av table, and if so, add them to the result */ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd, struct extended_perms *xperms) { struct avtab_node *node; if (!ctab || !key || !avd) return; for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if ((u16)(AVTAB_ALLOWED | AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED | AVTAB_ENABLED))) avd->allowed |= node->datum.u.data; if ((u16)(AVTAB_AUDITDENY | AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITDENY | AVTAB_ENABLED))) /* Since a '0' in an auditdeny mask represents a * permission we do NOT want to audit (dontaudit), we use * the '&' operand to ensure that all '0's in the mask * are retained (much unlike the allow and auditallow cases). */ avd->auditdeny &= node->datum.u.data; if ((u16)(AVTAB_AUDITALLOW | AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITALLOW | AVTAB_ENABLED))) avd->auditallow |= node->datum.u.data; if (xperms && (node->key.specified & AVTAB_ENABLED) && (node->key.specified & AVTAB_XPERMS)) services_compute_xperms_drivers(xperms, node); } } static int cond_dup_av_list(struct cond_av_list *new, const struct cond_av_list *orig, struct avtab *avtab) { u32 i; memset(new, 0, sizeof(*new)); new->nodes = kcalloc(orig->len, sizeof(*new->nodes), GFP_KERNEL); if (!new->nodes) return -ENOMEM; for (i = 0; i < orig->len; i++) { new->nodes[i] = avtab_insert_nonunique( avtab, &orig->nodes[i]->key, &orig->nodes[i]->datum); if (!new->nodes[i]) return -ENOMEM; new->len++; } return 0; } static int duplicate_policydb_cond_list(struct policydb *newp, const struct policydb *origp) { int rc; u32 i; rc = avtab_alloc_dup(&newp->te_cond_avtab, &origp->te_cond_avtab); if (rc) return rc; newp->cond_list_len = 0; newp->cond_list = kcalloc(origp->cond_list_len, sizeof(*newp->cond_list), GFP_KERNEL); if (!newp->cond_list) goto error; for (i = 0; i < origp->cond_list_len; i++) { struct cond_node *newn = &newp->cond_list[i]; const struct cond_node *orign = &origp->cond_list[i]; newp->cond_list_len++; newn->cur_state = orign->cur_state; newn->expr.nodes = kmemdup(orign->expr.nodes, orign->expr.len * sizeof(*orign->expr.nodes), GFP_KERNEL); if (!newn->expr.nodes) goto error; newn->expr.len = orign->expr.len; rc = cond_dup_av_list(&newn->true_list, &orign->true_list, &newp->te_cond_avtab); if (rc) goto error; rc = cond_dup_av_list(&newn->false_list, &orign->false_list, &newp->te_cond_avtab); if (rc) goto error; } return 0; error: avtab_destroy(&newp->te_cond_avtab); cond_list_destroy(newp); return -ENOMEM; } static int cond_bools_destroy(void *key, void *datum, void *args) { /* key was not copied so no need to free here */ kfree(datum); return 0; } static int cond_bools_copy(struct hashtab_node *new, const struct hashtab_node *orig, void *args) { struct cond_bool_datum *datum; datum = kmemdup(orig->datum, sizeof(struct cond_bool_datum), GFP_KERNEL); if (!datum) return -ENOMEM; new->key = orig->key; /* No need to copy, never modified */ new->datum = datum; return 0; } static int cond_bools_index(void *key, void *datum, void *args) { struct cond_bool_datum *booldatum, **cond_bool_array; booldatum = datum; cond_bool_array = args; cond_bool_array[booldatum->value - 1] = booldatum; return 0; } static int duplicate_policydb_bools(struct policydb *newdb, const struct policydb *orig) { struct cond_bool_datum **cond_bool_array; int rc; cond_bool_array = kmalloc_array(orig->p_bools.nprim, sizeof(*orig->bool_val_to_struct), GFP_KERNEL); if (!cond_bool_array) return -ENOMEM; rc = hashtab_duplicate(&newdb->p_bools.table, &orig->p_bools.table, cond_bools_copy, cond_bools_destroy, NULL); if (rc) { kfree(cond_bool_array); return -ENOMEM; } hashtab_map(&newdb->p_bools.table, cond_bools_index, cond_bool_array); newdb->bool_val_to_struct = cond_bool_array; newdb->p_bools.nprim = orig->p_bools.nprim; return 0; } void cond_policydb_destroy_dup(struct policydb *p) { hashtab_map(&p->p_bools.table, cond_bools_destroy, NULL); hashtab_destroy(&p->p_bools.table); cond_policydb_destroy(p); } int cond_policydb_dup(struct policydb *new, const struct policydb *orig) { cond_policydb_init(new); if (duplicate_policydb_bools(new, orig)) return -ENOMEM; if (duplicate_policydb_cond_list(new, orig)) { cond_policydb_destroy_dup(new); return -ENOMEM; } return 0; } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_SYSCALL_H #define __ASM_SYSCALL_H #include <uapi/linux/audit.h> #include <linux/compat.h> #include <linux/err.h> typedef long (*syscall_fn_t)(const struct pt_regs *regs); extern const syscall_fn_t sys_call_table[]; #ifdef CONFIG_COMPAT extern const syscall_fn_t compat_sys_call_table[]; #endif static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { return regs->syscallno; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { regs->regs[0] = regs->orig_x0; } static inline long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs) { unsigned long val = regs->regs[0]; if (is_compat_thread(task_thread_info(task))) val = sign_extend64(val, 31); return val; } static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { unsigned long error = syscall_get_return_value(task, regs); return IS_ERR_VALUE(error) ? error : 0; } static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { if (error) val = error; if (is_compat_thread(task_thread_info(task))) val = lower_32_bits(val); regs->regs[0] = val; } #define SYSCALL_MAX_ARGS 6 static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { args[0] = regs->orig_x0; args++; memcpy(args, ®s->regs[1], 5 * sizeof(args[0])); } /* * We don't care about endianness (__AUDIT_ARCH_LE bit) here because * AArch64 has the same system calls both on little- and big- endian. */ static inline int syscall_get_arch(struct task_struct *task) { if (is_compat_thread(task_thread_info(task))) return AUDIT_ARCH_ARM; return AUDIT_ARCH_AARCH64; } int syscall_trace_enter(struct pt_regs *regs); void syscall_trace_exit(struct pt_regs *regs); #endif /* __ASM_SYSCALL_H */ |
787 787 724 738 738 739 738 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_PAGE_ORDER 10 #else #define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) # define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \ get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK)) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false # define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. */ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) #define folio_migratetype(folio) \ get_pfnblock_flags_mask(&folio->page, folio_pfn(folio), \ MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif NR_BALLOON_PAGES, NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * * After a folio is faulted in, the aging needs to check the accessed bit at * least twice before handing this folio over to the eviction. The first check * clears the accessed bit from the initial fault; the second check makes sure * this folio hasn't been used since then. This process, AKA second chance, * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two * generations are considered active; the rest of generations, if they exist, * are considered inactive. See lru_gen_is_active(). * * PG_active is always cleared while a folio is on one of lrugen->folios[] so * that the sliding window needs not to worry about it. And it's set again when * a folio considered active is isolated for non-reclaiming purposes, e.g., * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A folio accessed N times * through file descriptors is in tier order_base_2(N). A folio in the first * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by * PG_workingset. A folio in any other tier (1<N<5) between the first and last * is marked by additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the rest * infer whether folios accessed multiple times through file descriptors are * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) /* * For folios accessed multiple times through file descriptors, * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily * promoted into the second oldest generation in the eviction path. And when * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is * only valid when PG_referenced is set. * * For folios accessed multiple times through page tables, folio_update_gen() * from a page table walk or lru_gen_set_refs() from a rmap walk sets * PG_referenced after the accessed bit is cleared for the first time. * Thereafter, those two paths set PG_workingset and promote folios to the * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. * * For both cases above, after PG_workingset is set on a folio, it remains until * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It * can be set again if lru_gen_test_recent() returns true upon a refault. */ #define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) struct lruvec; struct page_vma_mapped_walk; #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types so that they can be * incremented independently. Ideally min_seq[] are kept in sync when both anon * and file types are evictable. However, to adapt to situations like extreme * swappiness, they are allowed to be out of sync by at most * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* can only be modified under the LRU lock */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* max_seq from lru_gen_folio: can be out of date */ unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; int swappiness; bool force_scan; }; /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * * Notes: * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing * of their max_seq counters ensures the eventual fairness to all eligible * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). * 2. There are only two valid generations: old (seq) and young (seq+1). * MEMCG_NR_GENS is set to three so that when reading the generation counter * locklessly, a stale value (seq-1) does not wraparound to young. */ #define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } #endif /* CONFIG_LRU_GEN */ struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; #ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Flags used in pcp->flags field. * * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. * * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before * draining PCP for consecutive high-order pages freeing without * allocation if data cache slice of CPU is large enough. To reduce * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) #define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int high_min; /* min high watermark */ int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high_min; int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; /* To be called once the last page in the zone is accepted */ struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Pages to be freed when next trylock succeeds */ struct llist_head trylock_free_pages; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; static inline unsigned long wmark_pages(const struct zone *z, enum zone_watermarks w) { return z->_watermark[w] + z->watermark_boost; } static inline unsigned long min_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_MIN); } static inline unsigned long low_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_LOW); } static inline unsigned long high_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_HIGH); } static inline unsigned long promo_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_PROMO); } static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) { return page_zonenum(&folio->page); } #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); return page_folio(page)->pgmap; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (is_zone_device_page(a) != is_zone_device_page(b)) return false; if (!is_zone_device_page(a)) return true; return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { return NULL; } #endif static inline bool folio_is_zone_device(const struct folio *folio) { return is_zone_device_page(&folio->page); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void); #else static inline bool has_managed_dma(void) { return false; } #endif #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. * * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!zonelist_zone(z)) ? true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available on all architectures. * However, we can exceed 6 bits on some other architectures except * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available * with the worst case of 64K pages on arm64) if we make sure the * exceeded bit is not applicable to powerpc. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT #define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int online_device_section(struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT static inline int preinited_vmemmap_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); } void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else static inline int preinited_vmemmap_section(struct mem_section *section) { return 0; } static inline void sparse_vmemmap_init_nid_early(int nid) { } static inline void sparse_vmemmap_init_nid_late(int nid) { } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); struct mem_section_usage *usage = READ_ONCE(ms->usage); return usage ? test_bit(idx, usage->subsection_map) : 0; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } #endif void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, unsigned long flags); #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); rcu_read_lock_sched(); if (!valid_section(ms)) { rcu_read_unlock_sched(); return 0; } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ ret = early_section(ms) || pfn_section_valid(ms, pfn); rcu_read_unlock_sched(); return ret; } #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start - 1); \ section_nr != -1; \ section_nr = next_present_section_nr(section_nr)) /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ |
265 186 189 186 265 265 265 265 265 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov * Copyright (C) 2016 Intel, Matthew Wilcox * Copyright (C) 2016 Intel, Ross Zwisler */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Radix tree node cache. */ struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has * to build the branch to its corresponding item, it also has to build the * branch to existing items if the size has to be increased (by * radix_tree_extend). * * The worst case is a zero height tree with just a single item at index 0, * and then inserting an item at index ULONG_MAX. This requires 2 new branches * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. * Hence: */ #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* * The IDR does not have to be as high as the radix tree since it uses * signed integers, not unsigned longs. */ #define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) #define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) #define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) /* * Per-cpu pool of preloaded nodes */ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { .lock = INIT_LOCAL_LOCK(lock), }; EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) { return parent ? slot - parent->slots : 0; } static unsigned int radix_tree_descend(const struct radix_tree_node *parent, struct radix_tree_node **nodep, unsigned long index) { unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, int offset) { __clear_bit(offset, node->tags[tag]); } static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, int offset) { return test_bit(offset, node->tags[tag]); } static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { return !!(root->xa_flags & ROOT_IS_IDR); } /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. */ static inline int any_tag_set(const struct radix_tree_node *node, unsigned int tag) { unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; } return 0; } static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) { bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); } /** * radix_tree_find_next_bit - find the next set bit in a memory region * * @node: where to begin the search * @tag: the tag index * @offset: the bitnumber to start searching at * * Unrollable variant of find_next_bit() for constant size arrays. * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, unsigned long offset) { const unsigned long *addr = node->tags[tag]; if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; tmp = *addr >> (offset % BITS_PER_LONG); if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } return RADIX_TREE_MAP_SIZE; } static unsigned int iter_offset(const struct radix_tree_iter *iter) { return iter->index & RADIX_TREE_MAP_MASK; } /* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; } static inline unsigned long node_maxindex(const struct radix_tree_node *node) { return shift_maxindex(node->shift); } static unsigned long next_index(unsigned long index, const struct radix_tree_node *node, unsigned long offset) { return (index & ~node_maxindex(node)) + (offset << node->shift); } /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; /* * Preload code isn't irq safe and it doesn't make sense to use * preloading during an interrupt anyway as all the allocations have * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the * cache first for the new node to get accounted to the memory * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN); if (ret) goto out; /* * Provided the caller has preloaded here, we will always * succeed in getting a node here (and never reach * kmem_cache_alloc) */ rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes; rtp->nodes = ret->parent; rtp->nr--; } /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(ret); goto out; } ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); if (ret) { ret->shift = shift; ret->offset = offset; ret->count = count; ret->nr_values = nr_values; ret->parent = parent; ret->array = root; } return ret; } void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); /* * Must only free zeroed nodes into the slab. We can be left with * non-NULL entries by radix_tree_free_nodes, so clear the entries * and tags here. */ memset(node->slots, 0, sizeof(node->slots)); memset(node->tags, 0, sizeof(node->tags)); INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } static inline void radix_tree_node_free(struct radix_tree_node *node) { call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; /* * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; rtp->nodes = node; rtp->nr++; } else { kmem_cache_free(radix_tree_node_cachep, node); } } ret = 0; out: return ret; } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); /* * The same as above function, except we don't guarantee preloading happens. * We do it, if we decide it helps. On success, return zero with preemption * disabled. On error, return -ENOMEM with preemption not disabled. */ int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, unsigned long index, unsigned int shift) { void *entry; unsigned int maxshift; int tag; /* Figure out what the shift should be. */ maxshift = shift; while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; do { struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL, root, shift, 0, 1, 0); if (!node) return -ENOMEM; if (is_idr(root)) { all_tag_set(node, IDR_FREE); if (!root_tag_get(root, IDR_FREE)) { tag_clear(node, IDR_FREE, 0); root_tag_set(root, IDR_FREE); } } else { /* Propagate the aggregated tag info to the new child */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } } BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { /* Moving a value entry root->xa_head to a node */ node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need * rcu_assign_pointer here */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: return maxshift + RADIX_TREE_MAP_SHIFT; } /** * radix_tree_shrink - shrink radix tree to minimum height * @root: radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; for (;;) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) break; node = entry_to_node(node); /* * The candidate node has more than one child, or its child * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; /* * For an IDR, we must not shrink entry 0 into the root in * case somebody calls idr_replace() with a pointer that * appears to be an internal entry */ if (!node->shift && is_idr(root)) break; if (radix_tree_is_internal_node(child)) entry_to_node(child)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new * one (root->xa_head) as far as dependent read barriers go. */ root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); /* * We have a dilemma here. The node's slot[0] must not be * NULLed in case there are concurrent lookups expecting to * find the item. However if this was a bottom-level node, * then it may be subject to the slot pointer being visible * to callers dereferencing it. If item corresponding to * slot[0] is subsequently deleted, these callers would expect * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer * problem (replacing direct root node with an indirect pointer * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); shrunk = true; } return shrunk; } static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; do { struct radix_tree_node *parent; if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->xa_head)) deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { parent->slots[node->offset] = NULL; parent->count--; } else { /* * Shouldn't the tags already have all been cleared * by the caller? */ if (!is_idr(root)) root_tag_clear_all(root); root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); deleted = true; node = parent; } while (node); return deleted; } /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Create, if necessary, and return the node and slot for an item * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. */ static int __radix_tree_create(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) return error; shift = error; child = rcu_dereference_raw(root->xa_head); } while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return -ENOMEM; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); slot = &node->slots[offset]; } if (nodep) *nodep = node; if (slotp) *slotp = slot; return 0; } /* * Free any nodes below this node. The tree is presumed to not need * shrinking, and any user data in the tree is presumed to not need a * destructor called on it. If we need to add a destructor, we can * add that functionality later. Note that we may not clear tags or * slots from the tree as an RCU walker may still have a pointer into * this subtree. We could replace the entries with RADIX_TREE_RETRY, * but we'll still have to clear those in rcu_free. */ static void radix_tree_free_nodes(struct radix_tree_node *node) { unsigned offset = 0; struct radix_tree_node *child = entry_to_node(node); for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; } offset++; while (offset == RADIX_TREE_MAP_SIZE) { struct radix_tree_node *old = child; offset = child->offset + 1; child = child->parent; WARN_ON_ONCE(!list_empty(&old->private_list)); radix_tree_node_free(old); if (old == entry_to_node(node)) return; } } } static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; if (xa_is_value(item)) node->nr_values++; } return 1; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; void __rcu **slot; int error; BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, &node, &slot); if (error) return error; error = insert_entries(node, slot, item); if (error < 0) return error; if (node) { unsigned offset = get_slot_offset(node, slot); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); } else { BUG_ON(root_tags_get(root)); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Lookup and return the item at position @index in the radix * tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node, *parent; unsigned long maxindex; void __rcu **slot; restart: parent = NULL; slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; if (node == RADIX_TREE_RETRY) goto restart; if (parent->shift == 0) break; } if (nodep) *nodep = parent; if (slotp) *slotp = slot; return node; } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. */ void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root, unsigned long index) { void __rcu **slot; if (!__radix_tree_lookup(root, index, NULL, &slot)) return NULL; return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. */ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) { return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; } rcu_assign_pointer(*slot, item); } static bool node_tag_get(const struct radix_tree_root *root, const struct radix_tree_node *node, unsigned int tag, unsigned int offset) { if (node) return tag_get(node, tag, offset); return root_tag_get(root, tag); } /* * IDR users want to be able to store NULL in the tree, so if the slot isn't * free, don't adjust the count, even if it's transitioning between NULL and * non-NULL. For the IDA, we mark slots as being IDR_FREE while they still * have empty bits, but it only stores NULL in slots when they're being * deleted. */ static int calculate_count(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, void *old) { if (is_idr(root)) { unsigned offset = get_slot_offset(node, slot); bool free = node_tag_get(root, node, IDR_FREE, offset); if (!free) return 0; if (!old) return 1; } return !!item - !!old; } /** * __radix_tree_replace - replace item in a slot * @root: radix tree root * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. */ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || values)); replace_slot(slot, item, node, count, values); if (!node) return; delete_node(root, node); } /** * radix_tree_replace_slot - replace item in a slot * @root: radix tree root * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_for_each_slot(). * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { __radix_tree_replace(root, iter->node, slot, item); } static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (tag_get(node, tag, offset)) return; tag_set(node, tag, offset); offset = node->offset; node = node->parent; } if (!root_tag_get(root, tag)) root_tag_set(root, tag); } /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); BUG_ON(!node); if (!tag_get(parent, tag, offset)) tag_set(parent, tag, offset); } /* set the root's tag bit */ if (!root_tag_get(root, tag)) root_tag_set(root, tag); return node; } EXPORT_SYMBOL(radix_tree_tag_set); static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (!tag_get(node, tag, offset)) return; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) return; offset = node->offset; node = node->parent; } /* clear the root's tag bit */ if (root_tag_get(root, tag)) root_tag_clear(root, tag); } /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. If this causes * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: * has the same return value and semantics as radix_tree_lookup(). */ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; int offset = 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; parent = NULL; while (radix_tree_is_internal_node(node)) { parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); } if (node) node_tag_clear(root, parent, tag, offset); return node; } EXPORT_SYMBOL(radix_tree_tag_clear); /** * radix_tree_iter_tag_clear - clear a tag on the current iterator entry * @root: radix tree root * @iter: iterator state * @tag: tag to clear */ void radix_tree_iter_tag_clear(struct radix_tree_root *root, const struct radix_tree_iter *iter, unsigned int tag) { node_tag_clear(root, iter->node, tag, iter_offset(iter)); } /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * * 0: tag not present or not set * 1: tag set * * Note that the return value of this function may not be relied on, even if * the RCU lock is held, unless tag modification and node deletion are excluded * from concurrency. */ int radix_tree_tag_get(const struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; if (!root_tag_get(root, tag)) return 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return 0; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); if (!tag_get(parent, tag, offset)) return 0; if (node == RADIX_TREE_RETRY) break; } return 1; } EXPORT_SYMBOL(radix_tree_tag_get); /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, unsigned tag) { unsigned tag_long = offset / BITS_PER_LONG; unsigned tag_bit = offset % BITS_PER_LONG; if (!node) { iter->tags = 1; return; } iter->tags = node->tags[tag][tag_long] >> tag_bit; /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ if (tag_long < RADIX_TREE_TAG_LONGS - 1) { /* Pick tags from next element */ if (tag_bit) iter->tags |= node->tags[tag][tag_long + 1] << (BITS_PER_LONG - tag_bit); /* Clip chunk size, here only BITS_PER_LONG tags */ iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); } } void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; return NULL; } EXPORT_SYMBOL(radix_tree_iter_resume); /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if iteration is over */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *node, *child; unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; /* * Catch next_index overflow after ~0UL. iter->index never overflows * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) return NULL; restart: radix_tree_load_root(root, &child, &maxindex); if (index > maxindex) return NULL; if (!child) return NULL; if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; return (void __rcu **)&root->xa_head; } do { node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); if ((flags & RADIX_TREE_ITER_TAGGED) ? !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; if (flags & RADIX_TREE_ITER_TAGGED) offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); if (slot) break; } index &= ~node_maxindex(node); index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; child = rcu_dereference_raw(node->slots[offset]); } if (!child) goto restart; if (child == RADIX_TREE_RETRY) break; } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the * semantics of an RCU protected gang lookup are as though multiple * radix_tree_lookups have been issued in individual locks, and results * stored in 'results'. */ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and * returns the number of items which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a * radix tree based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the slots at *@results and * returns the number of slots which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = slot; if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; if (is_idr(root)) node_tag_set(root, node, IDR_FREE, offset); else for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node); } /** * radix_tree_iter_delete - delete the entry at this iterator position * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * * Delete the entry at the position currently pointed to by the iterator. * This may result in the current node being freed; if it is, the iterator * is advanced so that it will not reference the freed memory. This * function may be called without any locking if there are no other threads * which can access this tree. */ void radix_tree_iter_delete(struct radix_tree_root *root, struct radix_tree_iter *iter, void __rcu **slot) { if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key * @item: expected item * * Remove @item at @index from the radix tree rooted at @root. * * Return: the deleted entry, or %NULL if it was not present * or the entry at the given @index was not @item. */ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL; void __rcu **slot = NULL; void *entry; entry = __radix_tree_lookup(root, index, &node, &slot); if (!slot) return NULL; if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))) return NULL; if (item && entry != item) return NULL; __radix_tree_delete(root, node, slot); return entry; } EXPORT_SYMBOL(radix_tree_delete_item); /** * radix_tree_delete - delete an entry from a radix tree * @root: radix tree root * @index: index key * * Remove the entry at @index from the radix tree rooted at @root. * * Return: The deleted entry, or %NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test */ int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag) { return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); /** * idr_preload - preload for idr_alloc() * @gfp_mask: allocation mask to use for preloading * * Preallocate memory to use for the next call to idr_alloc(). This function * returns with preemption disabled. It will be enabled by idr_preload_end(). */ void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; grow: shift = radix_tree_load_root(root, &child, &maxindex); if (!radix_tree_tagged(root, IDR_FREE)) start = max(start, maxindex + 1); if (start > max) return ERR_PTR(-ENOSPC); if (start > maxindex) { int error = radix_tree_extend(root, gfp, start, shift); if (error < 0) return ERR_PTR(error); shift = error; child = rcu_dereference_raw(root->xa_head); } if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return ERR_PTR(-ENOMEM); all_tag_set(child, IDR_FREE); rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; node = entry_to_node(child); offset = radix_tree_descend(node, &child, start); if (!tag_get(node, IDR_FREE, offset)) { offset = radix_tree_find_next_bit(node, IDR_FREE, offset + 1); start = next_index(start, node, offset); if (start > max || start == 0) return ERR_PTR(-ENOSPC); while (offset == RADIX_TREE_MAP_SIZE) { offset = node->offset + 1; node = node->parent; if (!node) goto grow; shift = node->shift; } child = rcu_dereference_raw(node->slots[offset]); } slot = &node->slots[offset]; } iter->index = start; if (node) iter->next_index = 1 + min(max, (start | node_maxindex(node))); else iter->next_index = 1; iter->node = node; set_iter_tags(iter, node, offset, IDR_FREE); return slot; } /** * idr_destroy - release all internal memory from an IDR * @idr: idr handle * * After this function is called, the IDR is empty, and may be reused or * the data structure containing it may be freed. * * A typical clean-up sequence for objects stored in an idr tree will use * idr_for_each() to free all objects, if necessary, then idr_destroy() to * free the memory used to keep track of those objects. */ void idr_destroy(struct idr *idr) { struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); static void radix_tree_node_ctor(void *arg) { struct radix_tree_node *node = arg; memset(node, 0, sizeof(*node)); INIT_LIST_HEAD(&node->private_list); } static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->parent; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } return 0; } void __init radix_tree_init(void) { int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); } |
201 166 169 169 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __ASM_ARM_KVM_ARCH_TIMER_H #define __ASM_ARM_KVM_ARCH_TIMER_H #include <linux/clocksource.h> #include <linux/hrtimer.h> enum kvm_arch_timers { TIMER_PTIMER, TIMER_VTIMER, NR_KVM_EL0_TIMERS, TIMER_HVTIMER = NR_KVM_EL0_TIMERS, TIMER_HPTIMER, NR_KVM_TIMERS }; enum kvm_arch_timer_regs { TIMER_REG_CNT, TIMER_REG_CVAL, TIMER_REG_TVAL, TIMER_REG_CTL, TIMER_REG_VOFF, }; struct arch_timer_offset { /* * If set, pointer to one of the offsets in the kvm's offset * structure. If NULL, assume a zero offset. */ u64 *vm_offset; /* * If set, pointer to one of the offsets in the vcpu's sysreg * array. If NULL, assume a zero offset. */ u64 *vcpu_offset; }; struct arch_timer_vm_data { /* Offset applied to the virtual timer/counter */ u64 voffset; /* Offset applied to the physical timer/counter */ u64 poffset; /* The PPI for each timer, global to the VM */ u8 ppi[NR_KVM_TIMERS]; }; struct arch_timer_context { struct kvm_vcpu *vcpu; /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; u64 ns_frac; /* Offset for this counter/timer */ struct arch_timer_offset offset; /* * We have multiple paths which can save/restore the timer state onto * the hardware, so we need some way of keeping track of where the * latest state is. */ bool loaded; /* Output level of the timer IRQ */ struct { bool level; } irq; /* Duplicated state from arch_timer.c for convenience */ u32 host_timer_irq; }; struct timer_map { struct arch_timer_context *direct_vtimer; struct arch_timer_context *direct_ptimer; struct arch_timer_context *emul_vtimer; struct arch_timer_context *emul_ptimer; }; void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map); struct arch_timer_cpu { struct arch_timer_context timers[NR_KVM_TIMERS]; /* Background timer used when the guest is not running */ struct hrtimer bg_timer; /* Is the timer enabled */ bool enabled; }; int __init kvm_timer_hyp_init(bool has_gic); int kvm_timer_enable(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_sync_nested(struct kvm_vcpu *vcpu); void kvm_timer_sync_user(struct kvm_vcpu *vcpu); bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); void kvm_timer_update_run(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); void kvm_timer_init_vm(struct kvm *kvm); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); u64 kvm_phys_timer_read(void); void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); void kvm_timer_init_vhe(void); #define vcpu_timer(v) (&(v)->arch.timer_cpu) #define vcpu_get_timer(v,t) (&vcpu_timer(v)->timers[(t)]) #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER]) #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER]) #define vcpu_hvtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HVTIMER]) #define vcpu_hptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HPTIMER]) #define arch_timer_ctx_index(ctx) ((ctx) - vcpu_timer((ctx)->vcpu)->timers) #define timer_vm_data(ctx) (&(ctx)->vcpu->kvm->arch.timer_data) #define timer_irq(ctx) (timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)]) u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg); void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg, u64 val); /* Needed for tracing */ u32 timer_get_ctl(struct arch_timer_context *ctxt); u64 timer_get_cval(struct arch_timer_context *ctxt); /* CPU HP callbacks */ void kvm_timer_cpu_up(void); void kvm_timer_cpu_down(void); /* CNTKCTL_EL1 valid bits as of DDI0487J.a */ #define CNTKCTL_VALID_BITS (BIT(17) | GENMASK_ULL(9, 0)) DECLARE_STATIC_KEY_FALSE(broken_cntvoff_key); static inline bool has_broken_cntvoff(void) { return static_branch_unlikely(&broken_cntvoff_key); } static inline bool has_cntpoff(void) { return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); } static inline u64 timer_get_offset(struct arch_timer_context *ctxt) { u64 offset = 0; if (!ctxt) return 0; if (ctxt->offset.vm_offset) offset += *ctxt->offset.vm_offset; if (ctxt->offset.vcpu_offset) offset += *ctxt->offset.vcpu_offset; return offset; } #endif |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" #include "hsr_framereg.h" bool hsr_invalid_dan_ingress_frame(__be16 protocol) { return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); } static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; struct hsr_priv *hsr; __be16 protocol; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; } port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ kfree_skb(skb); goto finish_consume; } /* For HSR, only tagged frames are expected (unless the device offloads * HSR tag removal), but for PRP there could be non tagged frames as * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || protocol == htons(ETH_P_HSR)) skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a * sequence number and require synchronisation vs other sender. */ if (port->type == HSR_PT_INTERLINK) { spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); } else { hsr_forward_skb(skb, port); } finish_consume: return RX_HANDLER_CONSUMED; finish_pass: return RX_HANDLER_PASS; } bool hsr_port_exists(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } static int hsr_check_dev_ok(struct net_device *dev, struct netlink_ext_ack *extack) { /* Don't allow HSR on non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave."); return -EINVAL; } /* Don't allow enslaving hsr devices */ if (is_hsr_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices."); return -EINVAL; } if (hsr_port_exists(dev)) { NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave."); return -EINVAL; } if (is_vlan_dev(dev)) { NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver."); return -EINVAL; } if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging."); return -EOPNOTSUPP; } /* HSR over bonded devices has not been tested, but I'm not sure it * won't work... */ return 0; } /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *port, struct netlink_ext_ack *extack) { struct net_device *hsr_dev; struct hsr_port *master; int res; /* Don't use promiscuous mode for offload since L2 frame forward * happens at the offloaded hardware. */ if (!port->hsr->fwd_offloaded) { res = dev_set_promiscuity(dev, 1); if (res) return res; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; res = netdev_upper_dev_link(dev, hsr_dev, extack); if (res) goto fail_upper_dev_link; res = netdev_rx_handler_register(dev, hsr_handle_frame, port); if (res) goto fail_rx_handler; dev_disable_lro(dev); return 0; fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: if (!port->hsr->fwd_offloaded) dev_set_promiscuity(dev, -1); return res; } int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type type, struct netlink_ext_ack *extack) { struct hsr_port *port, *master; int res; if (type != HSR_PT_MASTER) { res = hsr_check_dev_ok(dev, extack); if (res) return res; } port = hsr_port_get_hsr(hsr, type); if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; port->hsr = hsr; port->dev = dev; port->type = type; if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } list_add_tail_rcu(&port->port_list, &hsr->ports); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); return 0; fail_dev_setup: kfree(port); return res; } void hsr_del_port(struct hsr_port *port) { struct hsr_priv *hsr; struct hsr_port *master; hsr = port->hsr; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); list_del_rcu(&port->port_list); if (port != master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); } kfree_rcu(port, rcu); } |
400 401 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> struct wb_stats { unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; }; static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static void collect_wb_stats(struct wb_stats *stats, struct bdi_writeback *wb) { struct inode *inode; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); } #ifdef CONFIG_CGROUP_WRITEBACK static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; collect_wb_stats(stats, wb); wb_put(wb); } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { collect_wb_stats(stats, &bdi->wb); } #endif static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct wb_stats stats; unsigned long tot_bw; global_dirty_limits(&background_thresh, &dirty_thresh); memset(&stats, 0, sizeof(stats)); stats.dirty_thresh = dirty_thresh; bdi_collect_stats(bdi, &stats); tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", K(stats.nr_writeback), K(stats.nr_reclaimable), K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), K(stats.nr_dirtied), K(stats.nr_written), K(tot_bw), stats.nr_dirty, stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, struct wb_stats *stats) { seq_printf(m, "WbCgIno: %10lu\n" "WbWriteback: %10lu kB\n" "WbReclaimable: %10lu kB\n" "WbDirtyThresh: %10lu kB\n" "WbDirtied: %10lu kB\n" "WbWritten: %10lu kB\n" "WbWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "state: %10lx\n\n", #ifdef CONFIG_CGROUP_WRITEBACK cgroup_ino(wb->memcg_css->cgroup), #else 1ul, #endif K(stats->nr_writeback), K(stats->nr_reclaimable), K(stats->wb_thresh), K(stats->nr_dirtied), K(stats->nr_written), K(wb->avg_write_bandwidth), stats->nr_dirty, stats->nr_io, stats->nr_more_io, stats->nr_dirty_time, wb->state); } static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) continue; collect_wb_stats(&stats, wb); /* * Calculate thresh of wb in writeback cgroup which is min of * thresh in global domain and thresh in cgroup domain. Drop * rcu lock because cgwb_calc_thresh may sleep in * cgroup_rstat_flush. We can do so here because we have a ref. */ if (mem_cgroup_wb_domain(wb)) { rcu_read_unlock(); stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); rcu_read_lock(); } wb_stats_show(m, wb, &stats); wb_put(wb); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); if (err) fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { WARN_ON(delayed_work_pending(&wb->dwork)); percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). */ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. */ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. */ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { timer_delete_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); if (!flow) return NULL; flow->rule = flow_rule_alloc(num_actions); if (!flow->rule) { kfree(flow); return NULL; } flow->rule->match.dissector = &flow->match.dissector; flow->rule->match.mask = &flow->match.mask; flow->rule->match.key = &flow->match.key; return flow; } void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type) { struct nft_flow_match *match = &flow->match; struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.pf != NFPROTO_NETDEV || hook->ops.hooknum != NF_NETDEV_INGRESS) return false; dev = hook->ops.dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct net_device *dev; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { dev = hook->ops.dev; if (this_dev && this_dev != dev) continue; err = nft_chain_offload_cmd(basechain, dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { if (i-- <= 0) break; dev = hook->ops.dev; nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); } return err; } static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->flags & NLM_F_REPLACE || !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev != dev) continue; found = hook; break; } if (!found) continue; return chain; } } return NULL; } static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { unregister_netdevice_notifier(&nft_offload_netdev_notifier); } |
130 124 14 9 129 124 3 2 90 2 39 13 39 2 2 2 1 1 1 2 1 2 2 3 38 2 2 82 66 12 1 5 2 3 54 16 16 1 1 1 1 1 2 1 1 7 4 1 2 1 2 3 1 3 3 1 1 8 8 8 8 8 7 8 5 5 9 5 5 5 5 4 7 1 4 1 1 1 1 1 1 1 8 8 5 5 5 5 86 1 10 43 6 4 4 22 162 1 28 86 6 5 5 80 1 1 3 2 1 8 2 1 3 2 3 4 1 1 1 4 4 1 1 1 1 37 1 25 4 7 10 1 3 3 3 6 1 3 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/guest.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/bits.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/nospec.h> #include <linux/kvm_host.h> #include <linux/module.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <kvm/arm_hypercalls.h> #include <asm/cputype.h> #include <linux/uaccess.h> #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_nested.h> #include <asm/sigcontext.h> #include "trace.h" const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vm_stats_desc), }; const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, hvc_exit_stat), STATS_DESC_COUNTER(VCPU, wfe_exit_stat), STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vcpu_stats_desc), }; static bool core_reg_offset_is_vreg(u64 off) { return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr); } static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off) { int size; switch (off) { case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): case KVM_REG_ARM_CORE_REG(regs.sp): case KVM_REG_ARM_CORE_REG(regs.pc): case KVM_REG_ARM_CORE_REG(regs.pstate): case KVM_REG_ARM_CORE_REG(sp_el1): case KVM_REG_ARM_CORE_REG(elr_el1): case KVM_REG_ARM_CORE_REG(spsr[0]) ... KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): size = sizeof(__u64); break; case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): size = sizeof(__uint128_t); break; case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): size = sizeof(__u32); break; default: return -EINVAL; } if (!IS_ALIGNED(off, size / sizeof(__u32))) return -EINVAL; /* * The KVM_REG_ARM64_SVE regs must be used instead of * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on * SVE-enabled vcpus: */ if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off)) return -EINVAL; return size; } static void *core_reg_addr(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { u64 off = core_reg_offset_from_id(reg->id); int size = core_reg_size_from_offset(vcpu, off); if (size < 0) return NULL; if (KVM_REG_SIZE(reg->id) != size) return NULL; switch (off) { case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): off -= KVM_REG_ARM_CORE_REG(regs.regs[0]); off /= 2; return &vcpu->arch.ctxt.regs.regs[off]; case KVM_REG_ARM_CORE_REG(regs.sp): return &vcpu->arch.ctxt.regs.sp; case KVM_REG_ARM_CORE_REG(regs.pc): return &vcpu->arch.ctxt.regs.pc; case KVM_REG_ARM_CORE_REG(regs.pstate): return &vcpu->arch.ctxt.regs.pstate; case KVM_REG_ARM_CORE_REG(sp_el1): return __ctxt_sys_reg(&vcpu->arch.ctxt, SP_EL1); case KVM_REG_ARM_CORE_REG(elr_el1): return __ctxt_sys_reg(&vcpu->arch.ctxt, ELR_EL1); case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_EL1]): return __ctxt_sys_reg(&vcpu->arch.ctxt, SPSR_EL1); case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_ABT]): return &vcpu->arch.ctxt.spsr_abt; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_UND]): return &vcpu->arch.ctxt.spsr_und; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_IRQ]): return &vcpu->arch.ctxt.spsr_irq; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_FIQ]): return &vcpu->arch.ctxt.spsr_fiq; case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): off -= KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]); off /= 4; return &vcpu->arch.ctxt.fp_regs.vregs[off]; case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): return &vcpu->arch.ctxt.fp_regs.fpsr; case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): return &vcpu->arch.ctxt.fp_regs.fpcr; default: return NULL; } } static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* * Because the kvm_regs structure is a mix of 32, 64 and * 128bit fields, we index it as if it was a 32bit * array. Hence below, nr_regs is the number of entries, and * off the index in the "array". */ __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); void *addr; u32 off; /* Our ID is an index into the kvm_regs struct. */ off = core_reg_offset_from_id(reg->id); if (off >= nr_regs || (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; addr = core_reg_addr(vcpu, reg); if (!addr) return -EINVAL; if (copy_to_user(uaddr, addr, KVM_REG_SIZE(reg->id))) return -EFAULT; return 0; } static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); __uint128_t tmp; void *valp = &tmp, *addr; u64 off; int err = 0; /* Our ID is an index into the kvm_regs struct. */ off = core_reg_offset_from_id(reg->id); if (off >= nr_regs || (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; addr = core_reg_addr(vcpu, reg); if (!addr) return -EINVAL; if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) return -EINVAL; if (copy_from_user(valp, uaddr, KVM_REG_SIZE(reg->id))) { err = -EFAULT; goto out; } if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) { u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK; switch (mode) { case PSR_AA32_MODE_USR: if (!kvm_supports_32bit_el0()) return -EINVAL; break; case PSR_AA32_MODE_FIQ: case PSR_AA32_MODE_IRQ: case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; case PSR_MODE_EL2h: case PSR_MODE_EL2t: if (!vcpu_has_nv(vcpu)) return -EINVAL; fallthrough; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: if (vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; default: err = -EINVAL; goto out; } } memcpy(addr, valp, KVM_REG_SIZE(reg->id)); if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. * AArch32 r0-r14 conveniently map to AArch64 x0-x14. */ case PSR_AA32_MODE_USR: case PSR_AA32_MODE_SYS: nr_reg = 15; break; /* * Otherwise, this is a privileged mode, and *all* the * registers must be narrowed to 32bit. */ default: nr_reg = 31; break; } for (i = 0; i < nr_reg; i++) vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i)); *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu); } out: return err; } #define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) #define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) #define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq))) static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { unsigned int max_vq, vq; u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; if (!vcpu_has_sve(vcpu)) return -ENOENT; if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl))) return -EINVAL; memset(vqs, 0, sizeof(vqs)); max_vq = vcpu_sve_max_vq(vcpu); for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (sve_vq_available(vq)) vqs[vq_word(vq)] |= vq_mask(vq); if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs))) return -EFAULT; return 0; } static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { unsigned int max_vq, vq; u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; if (!vcpu_has_sve(vcpu)) return -ENOENT; if (kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; /* too late! */ if (WARN_ON(vcpu->arch.sve_state)) return -EINVAL; if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs))) return -EFAULT; max_vq = 0; for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq) if (vq_present(vqs, vq)) max_vq = vq; if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) return -EINVAL; /* * Vector lengths supported by the host can't currently be * hidden from the guest individually: instead we can only set a * maximum via ZCR_EL2.LEN. So, make sure the available vector * lengths match the set requested exactly up to the requested * maximum: */ for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (vq_present(vqs, vq) != sve_vq_available(vq)) return -EINVAL; /* Can't run with no vector lengths at all: */ if (max_vq < SVE_VQ_MIN) return -EINVAL; /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */ vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq); return 0; } #define SVE_REG_SLICE_SHIFT 0 #define SVE_REG_SLICE_BITS 5 #define SVE_REG_ID_SHIFT (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS) #define SVE_REG_ID_BITS 5 #define SVE_REG_SLICE_MASK \ GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1, \ SVE_REG_SLICE_SHIFT) #define SVE_REG_ID_MASK \ GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT) #define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS) #define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0)) #define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0)) /* * Number of register slices required to cover each whole SVE register. * NOTE: Only the first slice every exists, for now. * If you are tempted to modify this, you must also rework sve_reg_to_region() * to match: */ #define vcpu_sve_slices(vcpu) 1 /* Bounds of a single SVE register slice within vcpu->arch.sve_state */ struct sve_state_reg_region { unsigned int koffset; /* offset into sve_state in kernel memory */ unsigned int klen; /* length in kernel memory */ unsigned int upad; /* extra trailing padding in user memory */ }; /* * Validate SVE register ID and get sanitised bounds for user/kernel SVE * register copy */ static int sve_reg_to_region(struct sve_state_reg_region *region, struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* reg ID ranges for Z- registers */ const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0); const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1, SVE_NUM_SLICES - 1); /* reg ID ranges for P- registers and FFR (which are contiguous) */ const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0); const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1); unsigned int vq; unsigned int reg_num; unsigned int reqoffset, reqlen; /* User-requested offset and length */ unsigned int maxlen; /* Maximum permitted length */ size_t sve_state_size; const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1, SVE_NUM_SLICES - 1); /* Verify that the P-regs and FFR really do have contiguous IDs: */ BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1); /* Verify that we match the UAPI header: */ BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES); reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT; if (reg->id >= zreg_id_min && reg->id <= zreg_id_max) { if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; reqlen = KVM_SVE_ZREG_SIZE; maxlen = SVE_SIG_ZREG_SIZE(vq); } else if (reg->id >= preg_id_min && reg->id <= preg_id_max) { if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; reqlen = KVM_SVE_PREG_SIZE; maxlen = SVE_SIG_PREG_SIZE(vq); } else { return -EINVAL; } sve_state_size = vcpu_sve_state_size(vcpu); if (WARN_ON(!sve_state_size)) return -EINVAL; region->koffset = array_index_nospec(reqoffset, sve_state_size); region->klen = min(maxlen, reqlen); region->upad = reqlen - region->klen; return 0; } static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { int ret; struct sve_state_reg_region region; char __user *uptr = (char __user *)reg->addr; /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ if (reg->id == KVM_REG_ARM64_SVE_VLS) return get_sve_vls(vcpu, reg); /* Try to interpret reg ID as an architectural SVE register... */ ret = sve_reg_to_region(®ion, vcpu, reg); if (ret) return ret; if (!kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset, region.klen) || clear_user(uptr + region.klen, region.upad)) return -EFAULT; return 0; } static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { int ret; struct sve_state_reg_region region; const char __user *uptr = (const char __user *)reg->addr; /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ if (reg->id == KVM_REG_ARM64_SVE_VLS) return set_sve_vls(vcpu, reg); /* Try to interpret reg ID as an architectural SVE register... */ ret = sve_reg_to_region(®ion, vcpu, reg); if (ret) return ret; if (!kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr, region.klen)) return -EFAULT; return 0; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; } static int copy_core_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { unsigned int i; int n = 0; for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { u64 reg = KVM_REG_ARM64 | KVM_REG_ARM_CORE | i; int size = core_reg_size_from_offset(vcpu, i); if (size < 0) continue; switch (size) { case sizeof(__u32): reg |= KVM_REG_SIZE_U32; break; case sizeof(__u64): reg |= KVM_REG_SIZE_U64; break; case sizeof(__uint128_t): reg |= KVM_REG_SIZE_U128; break; default: WARN_ON(1); continue; } if (uindices) { if (put_user(reg, uindices)) return -EFAULT; uindices++; } n++; } return n; } static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) { return copy_core_reg_indices(vcpu, NULL); } static const u64 timer_reg_list[] = { KVM_REG_ARM_TIMER_CTL, KVM_REG_ARM_TIMER_CNT, KVM_REG_ARM_TIMER_CVAL, KVM_REG_ARM_PTIMER_CTL, KVM_REG_ARM_PTIMER_CNT, KVM_REG_ARM_PTIMER_CVAL, }; #define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) static bool is_timer_reg(u64 index) { switch (index) { case KVM_REG_ARM_TIMER_CTL: case KVM_REG_ARM_TIMER_CNT: case KVM_REG_ARM_TIMER_CVAL: case KVM_REG_ARM_PTIMER_CTL: case KVM_REG_ARM_PTIMER_CNT: case KVM_REG_ARM_PTIMER_CVAL: return true; } return false; } static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { for (int i = 0; i < NUM_TIMER_REGS; i++) { if (put_user(timer_reg_list[i], uindices)) return -EFAULT; uindices++; } return 0; } static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; u64 val; int ret; ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); if (ret != 0) return -EFAULT; return kvm_arm_timer_set_reg(vcpu, reg->id, val); } static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; u64 val; val = kvm_arm_timer_get_reg(vcpu, reg->id); return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; } static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); if (!vcpu_has_sve(vcpu)) return 0; /* Policed by KVM_GET_REG_LIST: */ WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + 1; /* KVM_REG_ARM64_SVE_VLS */ } static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { const unsigned int slices = vcpu_sve_slices(vcpu); u64 reg; unsigned int i, n; int num_regs = 0; if (!vcpu_has_sve(vcpu)) return 0; /* Policed by KVM_GET_REG_LIST: */ WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); /* * Enumerate this first, so that userspace can save/restore in * the order reported by KVM_GET_REG_LIST: */ reg = KVM_REG_ARM64_SVE_VLS; if (put_user(reg, uindices++)) return -EFAULT; ++num_regs; for (i = 0; i < slices; i++) { for (n = 0; n < SVE_NUM_ZREGS; n++) { reg = KVM_REG_ARM64_SVE_ZREG(n, i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } for (n = 0; n < SVE_NUM_PREGS; n++) { reg = KVM_REG_ARM64_SVE_PREG(n, i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } reg = KVM_REG_ARM64_SVE_FFR(i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } return num_regs; } /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG * @vcpu: the vCPU pointer * * This is for all registers. */ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { unsigned long res = 0; res += num_core_regs(vcpu); res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); res += NUM_TIMER_REGS; return res; } /** * kvm_arm_copy_reg_indices - get indices of all registers. * @vcpu: the vCPU pointer * @uindices: register list to copy * * We do core registers right here, then we append system regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { int ret; ret = copy_core_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += ret; ret = copy_sve_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += ret; ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += kvm_arm_get_fw_num_regs(vcpu); ret = copy_timer_indices(vcpu, uindices); if (ret < 0) return ret; uindices += NUM_TIMER_REGS; return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); case KVM_REG_ARM_FW: case KVM_REG_ARM_FW_FEAT_BMAP: return kvm_arm_get_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } if (is_timer_reg(reg->id)) return get_timer_reg(vcpu, reg); return kvm_arm_sys_reg_get_reg(vcpu, reg); } int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); case KVM_REG_ARM_FW: case KVM_REG_ARM_FW_FEAT_BMAP: return kvm_arm_set_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } if (is_timer_reg(reg->id)) return set_timer_reg(vcpu, reg); return kvm_arm_sys_reg_set_reg(vcpu, reg); } int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { return -EINVAL; } int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE); events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); if (events->exception.serror_pending && events->exception.serror_has_esr) events->exception.serror_esr = vcpu_get_vsesr(vcpu); /* * We never return a pending ext_dabt here because we deliver it to * the virtual CPU directly when setting the event and it's no longer * 'pending' at this point. */ return 0; } int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { bool serror_pending = events->exception.serror_pending; bool has_esr = events->exception.serror_has_esr; bool ext_dabt_pending = events->exception.ext_dabt_pending; if (serror_pending && has_esr) { if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) return -EINVAL; if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK)) kvm_set_sei_esr(vcpu, events->exception.serror_esr); else return -EINVAL; } else if (serror_pending) { kvm_inject_vabt(vcpu); } if (ext_dabt_pending) kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); return 0; } u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); switch (implementor) { case ARM_CPU_IMP_ARM: switch (part_number) { case ARM_CPU_PART_AEM_V8: return KVM_ARM_TARGET_AEM_V8; case ARM_CPU_PART_FOUNDATION: return KVM_ARM_TARGET_FOUNDATION_V8; case ARM_CPU_PART_CORTEX_A53: return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_XGENE: return KVM_ARM_TARGET_XGENE_POTENZA; } break; } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { return -EINVAL; } /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging * @vcpu: the vCPU pointer * @dbg: the ioctl data buffer * * This sets up and enables the VM for guest debugging. Userspace * passes in a control flag to enable different debug types and * potentially other architecture specific information in the rest of * the structure. */ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { trace_kvm_set_guest_debug(vcpu, dbg->control); if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) return -EINVAL; if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); return 0; } vcpu->guest_debug = dbg->control; /* Hardware assisted Break and Watch points */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) vcpu->arch.external_debug_state = dbg->arch; return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: mutex_lock(&vcpu->kvm->arch.config_lock); ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); mutex_unlock(&vcpu->kvm->arch.config_lock); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_set_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_set_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_get_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_get_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_has_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_has_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags) { gpa_t guest_ipa = copy_tags->guest_ipa; size_t length = copy_tags->length; void __user *tags = copy_tags->addr; gpa_t gfn; bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); int ret = 0; if (!kvm_has_mte(kvm)) return -EINVAL; if (copy_tags->reserved[0] || copy_tags->reserved[1]) return -EINVAL; if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) return -EINVAL; if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) return -EINVAL; /* Lengths above INT_MAX cannot be represented in the return value */ if (length > INT_MAX) return -EINVAL; gfn = gpa_to_gfn(guest_ipa); mutex_lock(&kvm->slots_lock); if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { ret = -EBUSY; goto out; } while (length > 0) { struct page *page = __gfn_to_page(kvm, gfn, write); void *maddr; unsigned long num_tags; struct folio *folio; if (!page) { ret = -EFAULT; goto out; } if (!pfn_to_online_page(page_to_pfn(page))) { /* Reject ZONE_DEVICE memory */ kvm_release_page_unused(page); ret = -EFAULT; goto out; } folio = page_folio(page); maddr = page_address(page); if (!write) { if ((folio_test_hugetlb(folio) && folio_test_hugetlb_mte_tagged(folio)) || page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tags, MTE_GRANULES_PER_PAGE); kvm_release_page_clean(page); } else { /* * Only locking to serialise with a concurrent * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ if (folio_test_hugetlb(folio)) folio_try_hugetlb_mte_tagging(folio); else try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); if (folio_test_hugetlb(folio)) folio_set_hugetlb_mte_tagged(folio); else set_page_mte_tagged(page); kvm_release_page_dirty(page); } if (num_tags != MTE_GRANULES_PER_PAGE) { ret = -EFAULT; goto out; } gfn++; tags += num_tags; length -= PAGE_SIZE; } out: mutex_unlock(&kvm->slots_lock); /* If some data has been copied report the number of bytes copied */ if (length != copy_tags->length) return copy_tags->length - length; return ret; } |
324 170 170 333 2 330 8 551 71 71 71 63 552 320 353 3 905 898 71 354 121 63 903 898 317 900 54 900 358 358 572 905 346 138 860 333 7 860 7 2 71 7 334 333 3 335 71 335 552 550 550 551 333 12 71 334 3 2 3 336 337 334 337 336 71 318 3 336 146 902 357 905 901 326 318 34 860 652 335 335 335 335 326 327 60 325 327 327 327 296 326 327 327 4 4 4 348 70 98 320 321 4 71 71 3 71 71 71 71 318 318 318 318 71 71 3 71 71 71 9 9 1 59 2 27 49 49 23 1 71 71 70 12 70 71 71 69 71 71 71 67 333 71 71 71 71 71 71 71 71 71 70 71 70 7 42 47 5 31 50 24 3 23 2 18 9 4 65 3 56 14 3 63 63 63 358 357 111 157 357 2 2 2 2 34 11 31 34 34 71 34 29 24 24 23 1 33 3 71 8 9 34 9 9 9 9 71 71 2 71 71 71 71 71 71 3 71 71 71 71 7 71 71 71 71 71 71 71 71 71 71 70 24 28 34 1 29 24 29 8 34 874 876 874 855 629 875 876 872 874 4 34 34 2 9 34 34 34 34 34 34 34 34 34 9 28 28 28 22 19 28 33 9 63 63 63 63 3 63 59 7 63 63 3 63 63 63 63 63 63 63 61 63 39 9 46 51 46 9 38 38 63 63 9 46 38 63 9 63 137 138 138 9 129 136 138 318 290 31 328 23 27 357 358 356 84 358 358 61 31 31 31 31 21 2 19 2 23 4 1 12 11 258 167 166 167 167 165 31 30 31 8 23 31 31 286 235 70 319 69 331 331 266 266 1 332 15 65 326 331 319 69 9 9 3 6 9 59 74 7 6 126 6 196 3 52 51 1 52 65 352 65 327 200 63 52 9 332 31 65 327 23 355 85 281 136 63 9 355 326 358 196 357 132 358 353 70 63 52 330 341 342 202 327 146 9 44 33 33 33 33 33 33 334 334 38 38 38 36 9 38 38 334 342 343 46 336 337 342 38 344 343 343 41 65 552 335 551 3 551 552 551 551 551 550 551 333 643 642 641 551 552 81 643 551 642 552 643 644 642 550 551 33 33 33 33 33 33 33 219 258 642 257 257 33 33 33 33 33 33 33 33 3 3 3 3 3 3 3 3 129 130 3 3 130 7 130 317 317 318 307 307 8 307 89 307 334 333 242 319 325 326 357 358 353 117 117 334 317 156 317 317 128 51 51 344 343 13 13 342 344 133 133 552 552 290 552 551 1 257 551 552 339 165 167 167 167 167 11 11 9 9 5 33 33 338 337 277 130 24 595 594 594 597 34 159 595 34 593 335 333 334 333 335 335 335 335 335 335 333 334 335 335 335 335 335 335 335 335 335 335 334 333 335 335 335 334 335 335 335 335 335 333 335 334 335 335 335 333 334 333 334 335 335 335 335 334 334 335 335 334 335 335 333 335 335 333 334 334 335 335 335 335 335 335 335 335 335 335 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 | // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_BULK - Bulk insert mode * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_BULK 1 #define MA_STATE_REBALANCE 2 #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_one(struct maple_node *node) { kmem_cache_free(maple_node_cache, node); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_free_rcu(struct rcu_head *head) { struct maple_node *node = container_of(head, struct maple_node, rcu); kmem_cache_free(maple_node_cache, node); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; mas->tree->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Note types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x04 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *node; node = mte_to_node(enode); return ma_dead_node(node); } /* * mas_allocated() - Get the number of nodes allocated in a maple state. * @mas: The maple state * * The ma_state alloc member is overloaded to hold a pointer to the first * allocated node or to the number of requested nodes to allocate. If bit 0 is * set, then the alloc contains the number of requested nodes. If there is an * allocated node, then the total allocated nodes is in that node. * * Return: The total number of nodes allocated */ static inline unsigned long mas_allocated(const struct ma_state *mas) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) return 0; return mas->alloc->total; } /* * mas_set_alloc_req() - Set the requested number of allocations. * @mas: the maple state * @count: the number of allocations. * * The requested number of allocations is either in the first allocated node, * located in @mas->alloc->request_count, or directly in @mas->alloc if there is * no allocated node. Set the request either in the node or do the necessary * encoding to store in @mas->alloc directly. */ static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { if (!count) mas->alloc = NULL; else mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); return; } mas->alloc->request_count = count; } /* * mas_alloc_req() - get the requested number of allocations. * @mas: The maple state * * The alloc count is either stored directly in @mas, or in * @mas->alloc->request_count if there is at least one node allocated. Decode * the request count if it's stored directly in @mas->alloc. * * Return: The allocation request count. */ static inline unsigned int mas_alloc_req(const struct ma_state *mas) { if ((unsigned long)mas->alloc & 0x1) return (unsigned long)(mas->alloc) >> 1; else if (mas->alloc) return mas->alloc->request_count; return 0; } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mte_set_gap() - Set a maple node gap. * @mn: The encoded maple node * @gap: The offset of the gap to set * @val: The gap value */ static inline void mte_set_gap(const struct maple_enode *mn, unsigned char gap, unsigned long val) { switch (mte_node_type(mn)) { default: break; case maple_arange_64: mte_to_node(mn)->ma64.gap[gap] = val; break; } } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min to the correct values when walking up. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ if (WARN_ON(!total)) return NULL; if (total == 1) { /* single allocation in this ma_state */ mas->alloc = NULL; ret = node; goto single_node; } if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; mas->alloc->total = node->total - 1; ret = node; goto new_head; } node->total--; ret = node->slot[--node->node_count]; node->slot[node->node_count] = NULL; single_node: new_head: if (req) { req++; mas_set_alloc_req(mas, req); } memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } /* * mas_push_node() - Push a node back on the maple state allocation. * @mas: The maple state * @used: The used maple node * * Stores the maple node back into @mas->alloc for reuse. Updates allocated and * requested node count as necessary. */ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) { struct maple_alloc *reuse = (struct maple_alloc *)used; struct maple_alloc *head = mas->alloc; unsigned long count; unsigned int requested = mas_alloc_req(mas); count = mas_allocated(mas); reuse->request_count = 0; reuse->node_count = 0; if (count) { if (head->node_count < MAPLE_ALLOC_SLOTS) { head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->slot[0] = head; reuse->node_count = 1; } reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) mas_set_alloc_req(mas, requested - 1); } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; unsigned int max_req = 0; if (!requested) return; mas_set_alloc_req(mas, 0); if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; WARN_ON(!allocated); } if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; if (allocated) { node->slot[0] = mas->alloc; node->node_count = 1; } else { node->node_count = 0; } mas->alloc = node; node->total = ++allocated; node->request_count = 0; requested--; } node = mas->alloc; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; if (node->node_count == 0) { node->slot[0]->node_count = 0; node->slot[0]->request_count = 0; } node->node_count += count; allocated += count; /* find a non-full node*/ do { node = node->slot[0]; } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; return; nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); mas_set_err(mas, -ENOMEM); } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { struct maple_node *tmp = mte_to_node(used); if (mt_in_rcu(mas->tree)) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_node_count_gfp() - Check if enough nodes are allocated and request more * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags */ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) { unsigned long allocated = mas_allocated(mas); if (allocated < count) { mas_set_alloc_req(mas, count - allocated); mas_alloc_nodes(mas, gfp); } } /* * mas_node_count() - Check if enough nodes are allocated and request more if * there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == ma_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. */ if (unlikely((mas->mas_flags & MA_STATE_BULK))) { *mid_split = 0; split = b_end - mt_min_slots[bn->type]; if (!ma_is_leaf(bn->type)) return split; mas->mas_flags |= MA_STATE_REBALANCE; if (!bn->slot[split]) split--; return split; } /* * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { *mid_split = 0; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state * @end: The maple node end * @mt: The maple node type */ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, enum maple_type mt) { if (!(mas->mas_flags & MA_STATE_BULK)) return; if (mte_is_root(mas->node)) return; if (end > mt_min_slots[mt]) { mas->mas_flags &= ~MA_STATE_REBALANCE; return; } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_node_or_none() - Set the enode and state. * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node: the big node to add the entry * @mas: the maple state to get the pivot (mas->max) * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas: the maple state with the node that needs a parent * @left: possible parent 1 * @right: possible parent 2 * @slot: the slot the mas->node was placed * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @l: Pointer to left encoded maple node. * @m: Pointer to middle encoded maple node. * @r: Pointer to right encoded maple node. * @slot: The offset * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast: the maple subtree state * @left: the left node * @right: the right node * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. */ static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) continue; if (mast_overflow(mast)) continue; /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. */ static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple * state. * @mas: The maple state * @end: The end of the left-most node. * * During a mass-insert event (such as forking), it may be necessary to * rebalance the left-most node when it is not sufficient. */ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); MA_STATE(l_mas, mas->tree, mas->index, mas->last); l_mas = *mas; mas_prev_sibling(&l_mas); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { newnode = &reuse; } node = mas_mn(mas); newnode->parent = node->parent; slots = ma_slots(newnode, mt); pivs = ma_pivots(newnode, mt); left = mas_mn(&l_mas); l_slots = ma_slots(left, mt); l_pivs = ma_pivots(left, mt); if (!l_slots[split]) split++; tmp = mas_data_end(&l_mas) - split; memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); pivs[tmp] = l_mas.max; memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; unsigned char max_s = mt_slots[mt]; if (tmp < max_p) memset(pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ tmp = split + 1; memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); eparent = old_eparent; goto done; } /* RCU requires replacing both l_mas, mas, and parent. */ mas->node = mt_mk_node(newnode, mt); ma_set_meta(newnode, mt, 0, tmp); new_left = mas_pop_node(mas); new_left->parent = left->parent; mt = mte_node_type(l_mas.node); slots = ma_slots(new_left, mt); pivs = ma_pivots(new_left, mt); memcpy(slots, l_slots, sizeof(void *) * split); memcpy(pivs, l_pivs, sizeof(unsigned long) * split); ma_set_meta(new_left, mt, 0, split); l_mas.node = mt_mk_node(new_left, mt); /* replace parent. */ offset = mte_parent_slot(mas->node); mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; eparent = mt_mk_node(parent, mt); done: gap = mas_leaf_max_gap(mas); mte_set_gap(eparent, mte_parent_slot(mas->node), gap); gap = mas_leaf_max_gap(&l_mas); mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); if (in_rcu) { mas_replace_node(mas, old_eparent); mas_adopt_children(mas, mas->node); } mas_update_gap(mas); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas, int height) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; mas->depth = height; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, int height, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas, height + 1); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node */ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= mas->depth) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas, height); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node */ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); return mas_split(wr_mas->mas, b_node); } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mas->depth = 1; mas_set_height(mas); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return; } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(__func__, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node if there is anything to copy. */ if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(__func__, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. */ static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_invalid: MT_BUG_ON(mas->tree, 1); return; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas, new_end); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas, new_end); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: case wr_rebalance: mas_wr_bnode(wr_mas); break; } return; } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @mas: The maple state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) { int ret = mas_mt_height(mas) * 3 + 1; switch (mas->store_type) { case wr_invalid: WARN_ON_ONCE(1); break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_spanning_store: ret = mas_mt_height(mas) * 3 + 1; break; case wr_split_store: ret = mas_mt_height(mas) * 2 + 1; break; case wr_rebalance: ret = mas_mt_height(mas) * 2 - 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_append: case wr_exact_fit: case wr_slot_store: ret = 0; } return ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; int request; mas_wr_prealloc_setup(wr_mas); mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return; mas_node_count(mas, request); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) { mas->status = ma_underflow; return NULL; } goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; break; } mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, error == -EBUSY) * found the gap. (return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: mt_free_rcu(&node->rcu); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) mt_free_rcu(&node->rcu); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } request = mas_prealloc_calc(mas, entry); if (!request) goto store; mas_node_count(mas, request); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); int ret = 0; int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return ret; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); ret = xa_err(mas->node); mas_destroy(mas); mas_reset(mas); return ret; } mas->mas_flags |= MA_STATE_PREALLOC; return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, * it is possible that the number inserted is less than the expected * number. To fix an invalid final node, a check is performed here to * rebalance the previous node with the final node. */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; if (mas_is_err(mas)) mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); mas->mas_flags &= ~MA_STATE_REBALANCE; } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); total = mas_allocated(mas); while (total) { node = mas->alloc; mas->alloc = node->slot[0]; if (node->node_count > 1) { size_t count = node->node_count - 1; mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } mt_free_one(ma_mnode_ptr(node)); total--; } mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); /* * mas_expected_entries() - Set the expected number of entries that will be inserted. * @mas: The maple state * @nr_entries: The number of expected entries. * * This will attempt to pre-allocate enough nodes to store the expected number * of entries. The allocations will occur using the bulk allocator interface * for speed. Please call mas_destroy() on the @mas after inserting the entries * to ensure any unused nodes are freed. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) { int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; struct maple_enode *enode = mas->node; int nr_nodes; int ret; /* * Sometimes it is necessary to duplicate a tree to a new tree, such as * forking a process and duplicating the VMAs from one tree to a new * tree. When such a situation arises, it is known that the new tree is * not going to be used until the entire tree is populated. For * performance reasons, it is best to use a bulk load with RCU disabled. * This allows for optimistic splitting that favours the left and reuse * of nodes during the operation. */ /* Optimize splitting for bulk insert in-order */ mas->mas_flags |= MA_STATE_BULK; /* * Avoid overflow, assume a gap between each entry and a trailing null. * If this is wrong, it just means allocation can happen during * insertion of entries. */ nr_nodes = max(nr_entries, nr_entries * 2 + 1); if (!mt_is_alloc(mas->tree)) nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; /* Leaves; reduce slots to keep space for expansion */ nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; if (!mas_is_err(mas)) return 0; ret = xa_err(mas->node); mas->node = enode; mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_expected_entries); static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas->status = ma_active; break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas->status = ma_active; break; case ma_overflow: /* User expects mas to be one after where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas_allocated(mas)) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), sizeof(struct maple_node), SLAB_PANIC, NULL); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(__func__, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(__func__, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); mt_free_one(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char request, count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); request = mas_data_end(mas) + 1; count = mt_alloc_bulk(gfp, request, (void **)new_slots); if (unlikely(count < request)) { memset(new_slots, 0, request * sizeof(void *)); mas_set_err(mas, -ENOMEM); return; } /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; ((unsigned long *)new_slots)[i] |= val; } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) return; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(__func__, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */ |
22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ipi #if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IPI_H #include <linux/tracepoint.h> /** * ipi_raise - called when a smp cross call is made * * @mask: mask of recipient CPUs for the IPI * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string. */ TRACE_EVENT(ipi_raise, TP_PROTO(const struct cpumask *mask, const char *reason), TP_ARGS(mask, reason), TP_STRUCT__entry( __bitmask(target_cpus, nr_cpumask_bits) __field(const char *, reason) ), TP_fast_assign( __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits); __entry->reason = reason; ), TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason) ); TRACE_EVENT(ipi_send_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback), TP_ARGS(cpu, callsite, callback), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, callback) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->callback = callback; ), TP_printk("cpu=%u callsite=%pS callback=%pS", __entry->cpu, __entry->callsite, __entry->callback) ); TRACE_EVENT(ipi_send_cpumask, TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback), TP_ARGS(cpumask, callsite, callback), TP_STRUCT__entry( __cpumask(cpumask) __field(void *, callsite) __field(void *, callback) ), TP_fast_assign( __assign_cpumask(cpumask, cpumask_bits(cpumask)); __entry->callsite = (void *)callsite; __entry->callback = callback; ), TP_printk("cpumask=%s callsite=%pS callback=%pS", __get_cpumask(cpumask), __entry->callsite, __entry->callback) ); DECLARE_EVENT_CLASS(ipi_handler, TP_PROTO(const char *reason), TP_ARGS(reason), TP_STRUCT__entry( __field(const char *, reason) ), TP_fast_assign( __entry->reason = reason; ), TP_printk("(%s)", __entry->reason) ); /** * ipi_entry - called immediately before the IPI handler * * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string, ideally the same as used with trace_ipi_raise * for that IPI. */ DEFINE_EVENT(ipi_handler, ipi_entry, TP_PROTO(const char *reason), TP_ARGS(reason) ); /** * ipi_exit - called immediately after the IPI handler returns * * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string, ideally the same as used with trace_ipi_raise for * that IPI. */ DEFINE_EVENT(ipi_handler, ipi_exit, TP_PROTO(const char *reason), TP_ARGS(reason) ); #endif /* _TRACE_IPI_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1041 1041 22 208 1041 1041 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPUs in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/bitmap.h> #include <linux/cpumask_types.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/gfp_types.h> #include <linux/numa.h> /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) #define nr_cpu_ids ((unsigned int)NR_CPUS) #else extern unsigned int nr_cpu_ids; #endif static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); #else nr_cpu_ids = nr; #endif } /* * We have several different "preferred sizes" for the cpumask * operations, depending on operation. * * For example, the bitmap scanning and operating operations have * optimized routines that work for the single-word case, but only when * the size is constant. So if NR_CPUS fits in one single word, we are * better off using that small constant, in order to trigger the * optimized bit finding. That is 'small_cpumask_size'. * * The clearing and copying operations will similarly perform better * with a constant size, but we limit that size arbitrarily to four * words. We call this 'large_cpumask_size'. * * Finally, some operations just want the exact limit, either because * they set bits or just don't have any faster fixed-sized versions. We * call this just 'nr_cpumask_bits'. * * Note that these optional constants are always guaranteed to be at * least as big as 'nr_cpu_ids' itself is, and all our cpumask * allocations are at least that size (see cpumask_size()). The * optimization comes from being able to potentially use a compile-time * constant instead of a run-time generated exact number of CPUs. */ #if NR_CPUS <= BITS_PER_LONG #define small_cpumask_bits ((unsigned int)NR_CPUS) #define large_cpumask_bits ((unsigned int)NR_CPUS) #elif NR_CPUS <= 4*BITS_PER_LONG #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits ((unsigned int)NR_CPUS) #else #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits nr_cpu_ids #endif #define nr_cpumask_bits nr_cpu_ids /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU IDs * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_enabled_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_enabled_mask ((const struct cpumask *)&__cpu_enabled_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; extern cpumask_t cpus_booted_once_mask; static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static __always_inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, small_cpumask_bits); return cpu; } /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_zero - get the first unset cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if all cpus are set. */ static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 * @srcp1: the first input * @srcp2: the second input * @srcp3: the third input * * Return: >= nr_cpu_ids if no cpus set in all. */ static __always_inline unsigned int cpumask_first_and_and(const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Return: >= nr_cpumask_bits if no CPUs set. */ static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_next - get the next cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set. */ static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1); } /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus unset. */ static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1); } #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } #else unsigned int cpumask_local_spread(unsigned int i, int node); unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); unsigned int cpumask_any_distribute(const struct cpumask *srcp); #endif /* NR_CPUS */ /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from * @n+1. If nothing found, wrap around and start from * the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ static __always_inline unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing * found, wrap around and start from the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src: cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *src) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding * those present in another. * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_andnot(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_andnot(cpu, mask1, mask2) \ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_or - iterate over every cpu present in either mask * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_or(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_or(cpu, mask1, mask2) \ for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask. * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_from(cpu, mask) \ for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * * Often used to find any cpu but smp_processor_id() in a mask. * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { unsigned int i; cpumask_check(cpu); for_each_cpu(i, mask) if (i != cpu) break; return i; } /** * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, unsigned int cpu) { unsigned int i; cpumask_check(cpu); i = cpumask_first_and(mask1, mask2); if (i != cpu) return i; return cpumask_next_and(cpu, mask1, mask2); } /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and - get the Nth cpu in 2 cpumasks * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_andnot - get the Nth cpu set in 1st cpumask, and clear in 2nd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @srcp3: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_nth_and_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits, cpumask_check(cpu)); } #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_assign_cpu - assign a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer * @bool: the value to assign */ static __always_inline void cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value) { assign_bit(cpumask_check(cpu), cpumask_bits(dstp), value); } static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value) { __assign_bit(cpumask_check(cpu), cpumask_bits(dstp), value); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Return: true if @cpu is set in @cpumask, else returns false */ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_set_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_clear_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_setall(struct cpumask *dstp) { if (small_const_nbits(small_cpumask_bits)) { cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); return; } bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), large_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input * * Return: true if the cpumasks are equal, false if not */ static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input * * Return: true if first cpumask ORed with second cpumask == third cpumask, * otherwise false */ static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), small_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input * * Return: true if first cpumask ANDed with second cpumask is non-empty, * otherwise false */ static __always_inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Return: true if *@src1p is a subset of *@src2p, else returns false */ static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. * * Return: true if srcp is empty (has no bits set), else false */ static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. * * Return: true if srcp is full (has all bits set), else false */ static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in *srcp */ static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, small_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits); } /** * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes * * Return: size to allocate for a &struct cpumask in bytes */ static __always_inline unsigned int cpumask_size(void) { return bitmap_size(large_cpumask_bits); } #ifdef CONFIG_CPUMASK_OFFSTACK #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); } /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * See alloc_cpumask_var_node. * * Return: %true if allocation succeeded, %false if not */ static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); } void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static __always_inline void free_cpumask_var(cpumask_var_t mask) { } static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T)); /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #if NR_CPUS == 1 /* Uniprocessor: the possible/online/present masks are always "1" */ #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_possible_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) #define for_each_possible_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) #define for_each_online_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. * * Return: momentary snapshot of the number of online CPUs */ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_enabled_mask); } static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_enabled_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_dying(unsigned int cpu) { return false; } #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Return: the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } /** * cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as * hex values of cpumask * * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * The function prints the cpumask into the buffer as hex values of * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } /* * Provide a valid theoretical max size for cpumap and cpulist sysfs files * to avoid breaking userspace which may allocate a buffer based on the size * reported by e.g. fstat. * * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. * * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up * to 2 orders of magnitude larger than 8192. And then we divide by 2 to * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * * Use PAGE_SIZE as a minimum for smaller configurations while avoiding * unsigned comparison to -1. */ #define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) #endif /* __LINUX_CPUMASK_H */ |
2 2 24 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Operations on the network namespace */ #ifndef __NET_NET_NAMESPACE_H #define __NET_NET_NAMESPACE_H #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/list.h> #include <linux/sysctl.h> #include <linux/uidgid.h> #include <net/flow.h> #include <net/netns/core.h> #include <net/netns/mib.h> #include <net/netns/unix.h> #include <net/netns/packet.h> #include <net/netns/ipv4.h> #include <net/netns/ipv6.h> #include <net/netns/nexthop.h> #include <net/netns/ieee802154_6lowpan.h> #include <net/netns/sctp.h> #include <net/netns/netfilter.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netns/conntrack.h> #endif #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) #include <net/netns/flow_table.h> #endif #include <net/netns/nftables.h> #include <net/netns/xfrm.h> #include <net/netns/mpls.h> #include <net/netns/can.h> #include <net/netns/xdp.h> #include <net/netns/smc.h> #include <net/netns/bpf.h> #include <net/netns/mctp.h> #include <net/net_trackers.h> #include <linux/ns_common.h> #include <linux/idr.h> #include <linux/skbuff.h> #include <linux/notifier.h> #include <linux/xarray.h> struct user_namespace; struct proc_dir_entry; struct net_device; struct sock; struct ctl_table_header; struct net_generic; struct uevent_sock; struct netns_ipvs; struct bpf_prog; #define NETDEV_HASHBITS 8 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { /* First cache line can be often dirtied. * Do not place here read-mostly fields. */ refcount_t passive; /* To decide when the network * namespace should be freed. */ spinlock_t rules_mod_lock; unsigned int dev_base_seq; /* protected by rtnl_mutex */ u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit * methods on dead net ( * pernet_ops_rwsem read locked), * or to unregister pernet ops * (pernet_ops_rwsem write locked). */ struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ struct list_head ptype_all; struct list_head ptype_specific; #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; struct idr netns_ids; struct ns_common ns; struct ref_tracker_dir refcnt_tracker; struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not * refcounted against netns */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; #endif struct sock *rtnl; /* rtnetlink socket */ struct sock *genl_sock; struct uevent_sock *uevent_sock; /* uevent socket */ struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, * it is critical that it is on a read_mostly cache line. */ u32 hash_mix; struct net_device *loopback_dev; /* The loopback */ /* core fib_rules */ struct list_head rules_ops; struct netns_core core; struct netns_mib mib; struct netns_packet packet; #if IS_ENABLED(CONFIG_UNIX) struct netns_unix unx; #endif struct netns_nexthop nexthop; struct netns_ipv4 ipv4; #if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6; #endif #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct netns_ieee802154_lowpan ieee802154_lowpan; #endif #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp; #endif #ifdef CONFIG_NETFILTER struct netns_nf nf; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) struct netns_ft ft; #endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; #endif struct net_generic __rcu *gen; /* Used to store attached BPF programs */ struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif u64 net_cookie; /* written once */ #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif #if IS_ENABLED(CONFIG_MPLS) struct netns_mpls mpls; #endif #if IS_ENABLED(CONFIG_CAN) struct netns_can can; #endif #ifdef CONFIG_XDP_SOCKETS struct netns_xdp xdp; #endif #if IS_ENABLED(CONFIG_MCTP) struct netns_mctp mctp; #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif #ifdef CONFIG_DEBUG_NET_SMALL_RTNL /* Move to a better place when the config guard is removed. */ struct mutex rtnl_mutex; #endif } __randomize_layout; #include <linux/seq_file_net.h> /* Init's network namespace */ extern struct net init_net; #ifdef CONFIG_NET_NS struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); struct net *get_net_ns_by_fd(int fd); extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> static inline struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) return ERR_PTR(-EINVAL); return old_net; } static inline void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } static inline void net_ns_barrier(void) {} static inline struct ns_common *get_net_ns(struct ns_common *ns) { return ERR_PTR(-EINVAL); } static inline struct net *get_net_ns_by_fd(int fd) { return ERR_PTR(-EINVAL); } #endif /* CONFIG_NET_NS */ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); void ipx_unregister_sysctl(void); #else #define ipx_register_sysctl() #define ipx_unregister_sysctl() #endif #ifdef CONFIG_NET_NS void __put_net(struct net *net); /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { refcount_inc(&net->ns.count); return net; } static inline struct net *maybe_get_net(struct net *net) { /* Used when we know struct net exists but we * aren't guaranteed a previous reference count * exists. If the reference count is zero this * function fails and returns NULL. */ if (!refcount_inc_not_zero(&net->ns.count)) net = NULL; return net; } /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { if (refcount_dec_and_test(&net->ns.count)) __put_net(net); } static inline int net_eq(const struct net *net1, const struct net *net2) { return net1 == net2; } static inline int check_net(const struct net *net) { return refcount_read(&net->ns.count) != 0; } void net_drop_ns(void *); void net_passive_dec(struct net *net); #else static inline struct net *get_net(struct net *net) { return net; } static inline void put_net(struct net *net) { } static inline struct net *maybe_get_net(struct net *net) { return net; } static inline int net_eq(const struct net *net1, const struct net *net2) { return 1; } static inline int check_net(const struct net *net) { return 1; } #define net_drop_ns NULL static inline void net_passive_dec(struct net *net) { refcount_dec(&net->passive); } #endif static inline void net_passive_inc(struct net *net) { refcount_inc(&net->passive); } /* Returns true if the netns initialization is completed successfully */ static inline bool net_initialized(const struct net *net) { return READ_ONCE(net->list.next); } static inline void __netns_tracker_alloc(struct net *net, netns_tracker *tracker, bool refcounted, gfp_t gfp) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER ref_tracker_alloc(refcounted ? &net->refcnt_tracker : &net->notrefcnt_tracker, tracker, gfp); #endif } static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, gfp_t gfp) { __netns_tracker_alloc(net, tracker, true, gfp); } static inline void __netns_tracker_free(struct net *net, netns_tracker *tracker, bool refcounted) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER ref_tracker_free(refcounted ? &net->refcnt_tracker : &net->notrefcnt_tracker, tracker); #endif } static inline struct net *get_net_track(struct net *net, netns_tracker *tracker, gfp_t gfp) { get_net(net); netns_tracker_alloc(net, tracker, gfp); return net; } static inline void put_net_track(struct net *net, netns_tracker *tracker) { __netns_tracker_free(net, tracker, true); put_net(net); } typedef struct { #ifdef CONFIG_NET_NS struct net __rcu *net; #endif } possible_net_t; static inline void write_pnet(possible_net_t *pnet, struct net *net) { #ifdef CONFIG_NET_NS rcu_assign_pointer(pnet->net, net); #endif } static inline struct net *read_pnet(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference_protected(pnet->net, true); #else return &init_net; #endif } static inline struct net *read_pnet_rcu(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference(pnet->net); #else return &init_net; #endif } /* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) #define for_each_net_continue_reverse(VAR) \ list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list) #define for_each_net_rcu(VAR) \ list_for_each_entry_rcu(VAR, &net_namespace_list, list) #ifdef CONFIG_NET_NS #define __net_init #define __net_exit #define __net_initdata #define __net_initconst #else #define __net_init __init #define __net_exit __ref #define __net_initdata __initdata #define __net_initconst __initconst #endif int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp); int peernet2id(const struct net *net, struct net *peer); bool peernet_has_id(const struct net *net, struct net *peer); struct net *get_net_ns_by_id(const struct net *net, int id); struct pernet_operations { struct list_head list; /* * Below methods are called without any exclusive locks. * More than one net may be constructed and destructed * in parallel on several cpus. Every pernet_operations * have to keep in mind all other pernet_operations and * to introduce a locking, if they share common resources. * * The only time they are called with exclusive lock is * from register_pernet_subsys(), unregister_pernet_subsys() * register_pernet_device() and unregister_pernet_device(). * * Exit methods using blocking RCU primitives, such as * synchronize_rcu(), should be implemented via exit_batch. * Then, destruction of a group of net requires single * synchronize_rcu() related to these pernet_operations, * instead of separate synchronize_rcu() for every net. * Please, avoid synchronize_rcu() at all, where it's possible. * * Note that a combination of pre_exit() and exit() can * be used, since a synchronize_rcu() is guaranteed between * the calls. */ int (*init)(struct net *net); void (*pre_exit)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ void (*exit_batch_rtnl)(struct list_head *net_exit_list, struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; /* * Use these carefully. If you implement a network device and it * needs per network namespace operations use device pernet operations, * otherwise use pernet subsys operations. * * Network interfaces need to be removed from a dying netns _before_ * subsys notifiers can be called, as most of the network code cleanup * (which is done from subsys notifiers) runs with the assumption that * dev_remove_pack has been called so no new packets will arrive during * and after the cleanup functions have been called. dev_remove_pack * is not per namespace so instead the guarantee of no more packets * arriving in a network namespace is provided by ensuring that all * network devices and all sockets have left the network namespace * before the cleanup methods are called. * * For the longest time the ipv4 icmp code was registered as a pernet * device which caused kernel oops, and panics during network * namespace cleanup. So please don't get this wrong. */ int register_pernet_subsys(struct pernet_operations *); void unregister_pernet_subsys(struct pernet_operations *); int register_pernet_device(struct pernet_operations *); void unregister_pernet_device(struct pernet_operations *); struct ctl_table; #define register_net_sysctl(net, path, table) \ register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table)) #ifdef CONFIG_SYSCTL int net_sysctl_init(void); struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size); void unregister_net_sysctl_table(struct ctl_table_header *header); #else static inline int net_sysctl_init(void) { return 0; } static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { return NULL; } static inline void unregister_net_sysctl_table(struct ctl_table_header *header) { } #endif static inline int rt_genid_ipv4(const struct net *net) { return atomic_read(&net->ipv4.rt_genid); } #if IS_ENABLED(CONFIG_IPV6) static inline int rt_genid_ipv6(const struct net *net) { return atomic_read(&net->ipv6.fib6_sernum); } #endif static inline void rt_genid_bump_ipv4(struct net *net) { atomic_inc(&net->ipv4.rt_genid); } extern void (*__fib6_flush_trees)(struct net *net); static inline void rt_genid_bump_ipv6(struct net *net) { if (__fib6_flush_trees) __fib6_flush_trees(net); } #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) static inline struct netns_ieee802154_lowpan * net_ieee802154_lowpan(struct net *net) { return &net->ieee802154_lowpan; } #endif /* For callers who don't really care about whether it's IPv4 or IPv6 */ static inline void rt_genid_bump_all(struct net *net) { rt_genid_bump_ipv4(net); rt_genid_bump_ipv6(net); } static inline int fnhe_genid(const struct net *net) { return atomic_read(&net->fnhe_genid); } static inline void fnhe_genid_bump(struct net *net) { atomic_inc(&net->fnhe_genid); } #ifdef CONFIG_NET void net_ns_init(void); #else static inline void net_ns_init(void) {} #endif #endif /* __NET_NET_NAMESPACE_H */ |
248 34 3 72 255 255 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timer #if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMER_H #include <linux/tracepoint.h> #include <linux/hrtimer.h> #include <linux/timer.h> DECLARE_EVENT_CLASS(timer_class, TP_PROTO(struct timer_list *timer), TP_ARGS(timer), TP_STRUCT__entry( __field( void *, timer ) ), TP_fast_assign( __entry->timer = timer; ), TP_printk("timer=%p", __entry->timer) ); /** * timer_init - called when the timer is initialized * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_init, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_timer_flags(flags) \ __print_flags(flags, "|", \ { TIMER_MIGRATING, "M" }, \ { TIMER_DEFERRABLE, "D" }, \ { TIMER_PINNED, "P" }, \ { TIMER_IRQSAFE, "I" }) /** * timer_start - called when the timer is started * @timer: pointer to struct timer_list * @bucket_expiry: the bucket expiry time */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, unsigned long bucket_expiry), TP_ARGS(timer, bucket_expiry), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, bucket_expiry ) __field( unsigned long, now ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; __entry->expires = timer->expires; __entry->bucket_expiry = bucket_expiry; __entry->now = jiffies; __entry->flags = timer->flags; ), TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); /** * timer_expire_entry - called immediately before the timer callback * @timer: pointer to struct timer_list * @baseclk: value of timer_base::clk when timer expires * * Allows to determine the timer latency. */ TRACE_EVENT(timer_expire_entry, TP_PROTO(struct timer_list *timer, unsigned long baseclk), TP_ARGS(timer, baseclk), TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) __field( void *, function) __field( unsigned long, baseclk ) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; __entry->function = timer->function; __entry->baseclk = baseclk; ), TP_printk("timer=%p function=%ps now=%lu baseclk=%lu", __entry->timer, __entry->function, __entry->now, __entry->baseclk) ); /** * timer_expire_exit - called immediately after the timer callback returns * @timer: pointer to struct timer_list * * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); /** * timer_cancel - called when the timer is canceled * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_cancel, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); TRACE_EVENT(timer_base_idle, TP_PROTO(bool is_idle, unsigned int cpu), TP_ARGS(is_idle, cpu), TP_STRUCT__entry( __field( bool, is_idle ) __field( unsigned int, cpu ) ), TP_fast_assign( __entry->is_idle = is_idle; __entry->cpu = cpu; ), TP_printk("is_idle=%d cpu=%d", __entry->is_idle, __entry->cpu) ); #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \ { CLOCK_TAI, "CLOCK_TAI" }) #define decode_hrtimer_mode(mode) \ __print_symbolic(mode, \ { HRTIMER_MODE_ABS, "ABS" }, \ { HRTIMER_MODE_REL, "REL" }, \ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_setup - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_setup, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), TP_ARGS(hrtimer, clockid, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( clockid_t, clockid ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->clockid = clockid; __entry->mode = mode; ), TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer, decode_clockid(__entry->clockid), decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_start, TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), TP_ARGS(hrtimer, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( void *, function ) __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->function = ACCESS_PRIVATE(hrtimer, function); __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " "mode=%s", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer * @now: pointer to variable which contains current time of the * timers base. * * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, TP_PROTO(struct hrtimer *hrtimer, ktime_t *now), TP_ARGS(hrtimer, now), TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; __entry->function = ACCESS_PRIVATE(hrtimer, function); ), TP_printk("hrtimer=%p function=%ps now=%llu", __entry->hrtimer, __entry->function, (unsigned long long) __entry->now) ); DECLARE_EVENT_CLASS(hrtimer_class, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer), TP_STRUCT__entry( __field( void *, hrtimer ) ), TP_fast_assign( __entry->hrtimer = hrtimer; ), TP_printk("hrtimer=%p", __entry->hrtimer) ); /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * * When used in combination with the hrtimer_expire_entry tracepoint we can * determine the runtime of the callback function. */ DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_cancel - called when the hrtimer is canceled * @hrtimer: pointer to struct hrtimer */ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer * @value: the itimers value, itimer is canceled if value->it_value is * zero, otherwise it is started * @expires: the itimers expiry time */ TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) __field( long, value_nsec ) __field( long, interval_sec ) __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; __entry->interval_nsec = value->it_interval.tv_nsec; ), TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** * itimer_expire - called when itimer expires * @which: type of the interval timer * @pid: pid of the process which owns the timer * @now: current time, used to calculate the latency of itimer */ TRACE_EVENT(itimer_expire, TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( __field( int , which ) __field( pid_t, pid ) __field( unsigned long long, now ) ), TP_fast_assign( __entry->which = which; __entry->now = now; __entry->pid = pid_nr(pid); ), TP_printk("which=%d pid=%d now=%llu", __entry->which, (int) __entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ tick_dep_name(RCU) \ tick_dep_name_end(RCU_EXP) #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end /* The MASK will convert to their bits and they need to be processed too */ #define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); #define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); /* NONE only has a mask defined for it */ #define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ __print_symbolic(val, TICK_DEP_NAMES) TRACE_EVENT(tick_stop, TP_PROTO(int success, int dependency), TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) __field( int , dependency ) ), TP_fast_assign( __entry->success = success; __entry->dependency = dependency; ), TP_printk("success=%d dependency=%s", __entry->success, \ show_tick_dep_name(__entry->dependency)) ); #endif #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
4 3 3 4 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_NETDEV_LOCK_H #define _NET_NETDEV_LOCK_H #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> static inline bool netdev_trylock(struct net_device *dev) { return mutex_trylock(&dev->lock); } static inline void netdev_assert_locked(const struct net_device *dev) { lockdep_assert_held(&dev->lock); } static inline void netdev_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_assert_locked(dev); } static inline bool netdev_need_ops_lock(const struct net_device *dev) { bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; #if IS_ENABLED(CONFIG_NET_SHAPER) ret |= !!dev->netdev_ops->net_shaper_ops; #endif return ret; } static inline void netdev_lock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_lock(dev); } static inline void netdev_unlock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_unlock(dev); } static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) lockdep_assert_held(&dev->lock); else ASSERT_RTNL(); } static inline void netdev_ops_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_ops_assert_locked(dev); } static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { /* Only lower devices currently grab the instance lock, so no * real ordering issues can occur. In the near future, only * hardware devices will grab instance lock which also does not * involve any ordering. Suppress lockdep ordering warnings * until (if) we start grabbing instance lock on pure SW * devices (bond/team/veth/etc). */ if (a == b) return 0; return -1; } #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ static struct lock_class_key dev_instance_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ lockdep_set_class(&(dev)->lock, \ &dev_instance_lock_key); \ lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ &qdisc_xmit_lock_key); \ } int netdev_debug_event(struct notifier_block *nb, unsigned long event, void *ptr); #endif |
1513 1507 1509 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/llist.h> #include <asm/memory.h> #include <asm/pointer_auth.h> #include <asm/ptrace.h> #include <asm/sdei.h> #include <asm/stacktrace/common.h> extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); static inline struct stack_info stackinfo_get_irq(void) { unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); unsigned long high = low + IRQ_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } static inline bool on_irq_stack(unsigned long sp, unsigned long size) { struct stack_info info = stackinfo_get_irq(); return stackinfo_on_stack(&info, sp, size); } static inline struct stack_info stackinfo_get_task(const struct task_struct *tsk) { unsigned long low = (unsigned long)task_stack_page(tsk); unsigned long high = low + THREAD_SIZE; return (struct stack_info) { .low = low, .high = high, }; } static inline bool on_task_stack(const struct task_struct *tsk, unsigned long sp, unsigned long size) { struct stack_info info = stackinfo_get_task(tsk); return stackinfo_on_stack(&info, sp, size); } #define on_thread_stack() (on_task_stack(current, current_stack_pointer, 1)) #ifdef CONFIG_VMAP_STACK DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); static inline struct stack_info stackinfo_get_overflow(void) { unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); unsigned long high = low + OVERFLOW_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } #else #define stackinfo_get_overflow() stackinfo_get_unknown() #endif #if defined(CONFIG_ARM_SDE_INTERFACE) && defined(CONFIG_VMAP_STACK) DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); static inline struct stack_info stackinfo_get_sdei_normal(void) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); unsigned long high = low + SDEI_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } static inline struct stack_info stackinfo_get_sdei_critical(void) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); unsigned long high = low + SDEI_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } #else #define stackinfo_get_sdei_normal() stackinfo_get_unknown() #define stackinfo_get_sdei_critical() stackinfo_get_unknown() #endif #ifdef CONFIG_EFI extern u64 *efi_rt_stack_top; static inline struct stack_info stackinfo_get_efi(void) { unsigned long high = (u64)efi_rt_stack_top; unsigned long low = high - THREAD_SIZE; return (struct stack_info) { .low = low, .high = high, }; } #endif #endif /* __ASM_STACKTRACE_H */ |
8 1 3 1 1 1 179 179 12 11 1 1 72 72 63 8 1 8 64 71 72 72 385 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/reset.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/hw_breakpoint.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> #include <kvm/arm_arch_timer.h> #include <asm/cpufeature.h> #include <asm/cputype.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/virt.h> /* Maximum phys_shift supported for any VM on this host */ static u32 __ro_after_init kvm_ipa_limit; unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values */ #define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ PSR_F_BIT | PSR_D_BIT) #define VCPU_RESET_PSTATE_EL2 (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \ PSR_F_BIT | PSR_D_BIT) #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) unsigned int __ro_after_init kvm_sve_max_vl; int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); kvm_host_sve_max_vl = sve_max_vl(); kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; /* * The get_sve_reg()/set_sve_reg() ioctl interface will need * to be extended with multiple register slice support in * order to support vector lengths greater than * VL_ARCH_MAX: */ if (WARN_ON(kvm_sve_max_vl > VL_ARCH_MAX)) kvm_sve_max_vl = VL_ARCH_MAX; /* * Don't even try to make use of vector lengths that * aren't available on all CPUs, for now: */ if (kvm_sve_max_vl < sve_max_vl()) pr_warn("KVM: SVE vector length for guests limited to %u bytes\n", kvm_sve_max_vl); } return 0; } static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) { vcpu->arch.sve_max_vl = kvm_sve_max_vl; /* * Userspace can still customize the vector lengths by writing * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags); } /* * Finalize vcpu's maximum SVE vector length, allocating * vcpu->arch.sve_state as necessary. */ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) { void *buf; unsigned int vl; size_t reg_sz; int ret; vl = vcpu->arch.sve_max_vl; /* * Responsibility for these properties is shared between * kvm_arm_init_sve(), kvm_vcpu_enable_sve() and * set_sve_vls(). Double-check here just to be sure: */ if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl() || vl > VL_ARCH_MAX)) return -EIO; reg_sz = vcpu_sve_state_size(vcpu); buf = kzalloc(reg_sz, GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; ret = kvm_share_hyp(buf, buf + reg_sz); if (ret) { kfree(buf); return ret; } vcpu->arch.sve_state = buf; vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); return 0; } int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) { switch (feature) { case KVM_ARM_VCPU_SVE: if (!vcpu_has_sve(vcpu)) return -EINVAL; if (kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; return kvm_vcpu_finalize_sve(vcpu); } return -EINVAL; } bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) { if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu)) return false; return true; } void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { void *sve_state = vcpu->arch.sve_state; kvm_unshare_hyp(vcpu, vcpu + 1); if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); free_page((unsigned long)vcpu->arch.ctxt.vncr_array); kfree(vcpu->arch.vncr_tlb); kfree(vcpu->arch.ccsidr); } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) { if (vcpu_has_sve(vcpu)) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer * * This function sets the registers on the virtual CPU struct to their * architecturally defined reset values, except for registers whose reset is * deferred until kvm_arm_vcpu_finalize(). * * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT * ioctl or as part of handling a request issued by another VCPU in the PSCI * handling code. In the first case, the VCPU will not be loaded, and in the * second case the VCPU will be loaded. Because this function operates purely * on the memory-backed values of system registers, we want to do a full put if * we were loaded (handling a request) and load the values back at the end of * the function. Otherwise we leave the state alone. In both cases, we * disable preemption around the vcpu reset as we would otherwise race with * preempt notifiers which also call put/load. */ void kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_reset_state reset_state; bool loaded; u32 pstate; spin_lock(&vcpu->arch.mp_state_lock); reset_state = vcpu->arch.reset_state; vcpu->arch.reset_state.reset = false; spin_unlock(&vcpu->arch.mp_state_lock); preempt_disable(); loaded = (vcpu->cpu != -1); if (loaded) kvm_arch_vcpu_put(vcpu); if (!kvm_arm_vcpu_sve_finalized(vcpu)) { if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) kvm_vcpu_enable_sve(vcpu); } else { kvm_vcpu_reset_sve(vcpu); } if (vcpu_el1_is_32bit(vcpu)) pstate = VCPU_RESET_PSTATE_SVC; else if (vcpu_has_nv(vcpu)) pstate = VCPU_RESET_PSTATE_EL2; else pstate = VCPU_RESET_PSTATE_EL1; /* Reset core registers */ memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); vcpu->arch.ctxt.spsr_abt = 0; vcpu->arch.ctxt.spsr_und = 0; vcpu->arch.ctxt.spsr_irq = 0; vcpu->arch.ctxt.spsr_fiq = 0; vcpu_gp_regs(vcpu)->pstate = pstate; /* Reset system registers */ kvm_reset_sys_regs(vcpu); /* * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. */ if (reset_state.reset) { unsigned long target_pc = reset_state.pc; /* Gracefully handle Thumb2 entry point */ if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { target_pc &= ~1UL; vcpu_set_thumb(vcpu); } /* Propagate caller endianness */ if (reset_state.be) kvm_vcpu_set_be(vcpu); *vcpu_pc(vcpu) = target_pc; vcpu_set_reg(vcpu, 0, reset_state.r0); } /* Reset timer */ kvm_timer_vcpu_reset(vcpu); if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); } u32 kvm_get_pa_bits(struct kvm *kvm) { /* Fixed limit until we can configure ID_AA64MMFR0.PARange */ return kvm_ipa_limit; } u32 get_kvm_ipa_limit(void) { return kvm_ipa_limit; } int __init kvm_set_ipa_limit(void) { unsigned int parange; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* * IPA size beyond 48 bits for 4K and 16K page size is only supported * when LPA2 is available. So if we have LPA2, enable it, else cap to 48 * bits, in case it's reported as larger on the system. */ if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K) parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_2_SHIFT)) { case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT: kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); break; case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX: kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); break; default: kvm_err("Unsupported value for TGRAN_2, giving up\n"); return -EINVAL; } kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? " (Reduced IPA size, limited VM/VMM compatibility)" : "")); return 0; } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_inode_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* Crypto operations for filesystems */ struct fscrypt_operations { /* * If set, then fs/crypto/ will allocate a global bounce page pool the * first time an encryption key is set up for a file. The bounce page * pool is required by the following functions: * * - fscrypt_encrypt_pagecache_blocks() * - fscrypt_zeroout_range() for files not using inline crypto * * If the filesystem doesn't use those, it doesn't need to set this. */ unsigned int needs_bounce_pages : 1; /* * If set, then fs/crypto/ will allow the use of encryption settings * that assume inode numbers fit in 32 bits (i.e. * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other * prerequisites for these settings are also met. This is only useful * if the filesystem wants to support inline encryption hardware that is * limited to 32-bit or 64-bit data unit numbers and where programming * keyslots is very slow. */ unsigned int has_32bit_inodes : 1; /* * If set, then fs/crypto/ will allow users to select a crypto data unit * size that is less than the filesystem block size. This is done via * the log2_data_unit_size field of the fscrypt policy. This flag is * not compatible with filesystems that encrypt variable-length blocks * (i.e. blocks that aren't all equal to filesystem's block size), for * example as a result of compression. It's also not compatible with * the fscrypt_encrypt_block_inplace() and * fscrypt_decrypt_block_inplace() functions. */ unsigned int supports_subblock_data_units : 1; /* * This field exists only for backwards compatibility reasons and should * only be set by the filesystems that are setting it already. It * contains the filesystem-specific key description prefix that is * accepted for "logon" keys for v1 fscrypt policies. This * functionality is deprecated in favor of the generic prefix * "fscrypt:", which itself is deprecated in favor of the filesystem * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY. Filesystems that * are newly adding fscrypt support should not set this field. */ const char *legacy_key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); }; int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_crypt_info); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its * plaintext alias as a result of the encryption key being added, * DCACHE_NOKEY_NAME must be cleared and there might be an opportunity * to disable d_revalidate. Note that we don't have to support the * inverse operation because fscrypt doesn't allow no-key names to be * the source or target of a rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { /* * VFS calls fscrypt_handle_d_move even for non-fscrypt * filesystems. */ if (dentry->d_flags & DCACHE_NOKEY_NAME) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; /* * Other filesystem features might be handling dentry * revalidation, in which case it cannot be disabled. */ if (dentry->d_op->d_revalidate == fscrypt_d_revalidate) dentry->d_flags &= ~DCACHE_OP_REVALIDATE; } } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { /* * This code tries to only take ->d_lock when necessary to write * to ->d_flags. We shouldn't be peeking on d_flags for * DCACHE_OP_REVALIDATE unlocked, but in the unlikely case * there is a race, the worst it can happen is that we fail to * unset DCACHE_OP_REVALIDATE and pay the cost of an extra * d_revalidate. */ if (is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } else if (dentry->d_flags & DCACHE_OP_REVALIDATE && dentry->d_op->d_revalidate == fscrypt_d_revalidate) { /* * Unencrypted dentries and encrypted dentries where the * key is available are always valid from fscrypt * perspective. Avoid the cost of calling * fscrypt_d_revalidate unnecessarily. */ spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_OP_REVALIDATE; spin_unlock(&dentry->d_lock); } } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return folio->mapping == NULL; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { return bounce_folio->private; } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return false; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } static inline void fscrypt_set_bio_crypt_ctx_bh( struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { return true; } static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. */ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_inode_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. */ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; fscrypt_prepare_dentry(dentry, false); return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */ |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 | // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Generic netlink support functions to configure an SMC-R PNET table * * Copyright IBM Corp. 2016 * * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> */ #include <linux/module.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <net/netlink.h> #include <net/genetlink.h> #include <uapi/linux/if.h> #include <uapi/linux/smc.h> #include <rdma/ib_verbs.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_core.h" static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); static struct net_device *pnet_find_base_ndev(struct net_device *ndev); static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [SMC_PNETID_IBNAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1 }, [SMC_PNETID_IBPORT] = { .type = NLA_U8 } }; static struct genl_family smc_pnet_nl_family; enum smc_pnet_nametype { SMC_PNET_ETH = 1, SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; enum smc_pnet_nametype type; union { struct { char eth_name[IFNAMSIZ + 1]; struct net_device *ndev; netdevice_tracker dev_tracker; }; struct { char ib_name[IB_DEVICE_NAME_MAX + 1]; u8 ib_port; }; }; }; /* Check if the pnetid is set */ bool smc_pnet_is_pnetid_set(u8 *pnetid) { if (pnetid[0] == 0 || pnetid[0] == _S) return false; return true; } /* Check if two given pnetids match */ static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) { int i; for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { if ((pnetid1[i] == 0 || pnetid1[i] == _S) && (pnetid2[i] == 0 || pnetid2[i] == _S)) break; if (pnetid1[i] != pnetid2[i]) return false; } return true; } /* Remove a pnetid from the pnet table. */ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* remove table entry */ mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pr_warn_ratelimited("smc: net device %s " "erased user defined " "pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); } kfree(pnetelem); rc = 0; } } mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) return rc; /* remove ib devices */ mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { pr_warn_ratelimited("smc: ib device %s ibport " "%d erased user defined " "pnetid %.16s\n", ibdev->ibdev->name, ibport + 1, ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; rc = 0; } } } mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", dev_name(smcd->ops->get_dev(smcd)), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; rc = 0; } } mutex_unlock(&smcd_dev_list.mutex); return rc; } /* Add the reference to a given network device to the pnet table. */ static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); pnetelem->ndev = ndev; rc = 0; pr_warn_ratelimited("smc: adding net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Remove the reference to a given network device from the pnet table. */ static int smc_pnet_remove_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pnetelem->ndev = NULL; rc = 0; pr_warn_ratelimited("smc: removing net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Apply pnetid to ib device when no pnetid is set. */ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, char *pnet_name) { bool applied = false; mutex_lock(&smc_ib_devices.mutex); if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, SMC_MAX_PNETID_LEN); ib_dev->pnetid_by_user[ib_port - 1] = true; applied = true; } mutex_unlock(&smc_ib_devices.mutex); return applied; } /* Apply pnetid to smcd device when no pnetid is set. */ static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) { bool applied = false; mutex_lock(&smcd_dev_list.mutex); if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = true; applied = true; } mutex_unlock(&smcd_dev_list.mutex); return applied; } /* The limit for pnetid is 16 characters. * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. * Lower case letters are converted to upper case. * Interior blanks should not be used. */ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) { char *bf = skip_spaces(pnet_name); size_t len = strlen(bf); char *end = bf + len; if (!len) return false; while (--end >= bf && isspace(*end)) ; if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) return false; *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; bf++; } *pnetid = '\0'; return true; } /* Find an infiniband device by a given name. The device might not exist. */ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || (ibdev->ibdev->dev.parent && !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, IB_DEVICE_NAME_MAX - 1))) { goto out; } } ibdev = NULL; out: mutex_unlock(&smc_ib_devices.mutex); return ibdev; } /* Find an smcd device by a given name. The device might not exist. */ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } smcd_dev = NULL; out: mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, char *eth_name, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct net_device *ndev, *base_ndev; u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; bool new_netdev; int rc; /* check if (base) netdev already has a pnetid. If there is one, we do * not want to add a pnet table entry */ rc = -EEXIST; ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ if (ndev) { base_ndev = pnet_find_base_ndev(ndev); if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid)) goto out_put; } /* add a new netdev entry to the pnet table if there isn't one */ rc = -ENOMEM; new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { new_netdev = false; break; } } if (new_netdev) { if (ndev) { new_pe->ndev = ndev; netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } if (ndev) pr_warn_ratelimited("smc: net device %s " "applied user defined pnetid %.16s\n", new_pe->eth_name, new_pe->pnet_name); return 0; out_put: dev_put(ndev); return rc; } static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, u8 ib_port, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); if (ibdev_applied) pr_warn_ratelimited("smc: ib device %s ibport %d " "applied user defined pnetid " "%.16s\n", ib_dev->ibdev->name, ib_port, ib_dev->pnetid[ib_port - 1]); } smcd = smc_pnet_find_smcd(ib_name); if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { dev = smcd->ops->get_dev(smcd); pr_warn_ratelimited("smc: smcd device %s " "applied user defined pnetid " "%.16s\n", dev_name(dev), smcd->pnetid); } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. */ if (!ibdev_applied || !smcddev_applied) return -EEXIST; /* add a new ib entry to the pnet table if there isn't one */ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); new_pe->ib_port = ib_port; new_ibdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { new_ibdev = false; break; } } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 0 : -EEXIST; } /* Append a pnetid to the end of the pnet table if not already on this list. */ static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct smc_pnettable *pnettable; bool new_netdev = false; bool new_ibdev = false; struct smc_net *sn; u8 ibport = 1; char *string; int rc; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); if (!smc_pnetid_valid(string, pnet_name)) goto error; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); if (!rc) new_netdev = true; else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); string = strim(string); if (tb[SMC_PNETID_IBPORT]) { ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); if (!rc) new_ibdev = true; else if (rc != -EEXIST) goto error; } return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; } /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } if (pnetelem->type == SMC_PNET_IB) { if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) return -1; } return 0; } static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; return smc_pnet_remove_by_pnetid(net, (char *)nla_data(info->attrs[SMC_PNETID_NAME])); } static int smc_pnet_dump_start(struct netlink_callback *cb) { cb->args[0] = 0; return 0; } static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, struct smc_pnetentry *pnetelem) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, flags, SMC_PNETID_GET); if (!hdr) return -ENOMEM; if (smc_pnet_set_nla(skb, pnetelem) < 0) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; } genlmsg_end(skb, hdr); return 0; } static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; struct smc_net *sn; int idx = 0; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* dump pnettable entries */ mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; /* if this is not the initial namespace, dump only netdev */ if (net != &init_net && pnetelem->type != SMC_PNET_ETH) continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, pnetelem)) { --idx; break; } } mutex_unlock(&pnettable->lock); return idx; } static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx; idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NULL, cb->args[0]); cb->args[0] = idx; return skb->len; } /* Retrieve one PNETID entry */ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct sk_buff *msg; void *hdr; if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq, nla_data(info->attrs[SMC_PNETID_NAME]), 0); /* finish multi part message and send it */ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, NLM_F_MULTI); if (!hdr) { nlmsg_free(msg); return -EMSGSIZE; } return genlmsg_reply(msg, info); } /* Remove and delete all pnetids from pnet table. */ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); smc_pnet_remove_by_pnetid(net, NULL); return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_flush } }; /* SMC_PNETID family definition */ static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, .n_ops = ARRAY_SIZE(smc_pnet_ops), .resv_start_op = SMC_PNETID_FLUSH + 1, }; bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe; bool rc = false; read_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { rc = true; goto unlock; } } unlock: read_unlock(&sn->pnetids_ndev.lock); return rc; } static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pi; pe = kzalloc(sizeof(*pe), GFP_KERNEL); if (!pe) return -ENOMEM; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; } } refcount_set(&pe->refcnt, 1); memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); list_add_tail(&pe->list, &sn->pnetids_ndev.list); unlock: write_unlock(&sn->pnetids_ndev.lock); return 0; } static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pe2; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { if (refcount_dec_and_test(&pe->refcnt)) { list_del(&pe->list); kfree(pe); } break; } } write_unlock(&sn->pnetids_ndev.lock); } static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, u8 *ndev_pnetid) { struct net_device *base_dev; base_dev = __pnet_find_base_ndev(dev); if (base_dev->flags & IFF_UP && !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, ndev_pnetid)) { /* add to PNETIDs list */ smc_pnet_add_pnetid(net, ndev_pnetid); } } /* create initial list of netdevice pnetids */ static void smc_pnet_create_pnetids_list(struct net *net) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; /* Newly created netns do not have devices. * Do not even acquire rtnl. */ if (list_empty(&net->dev_base_head)) return; /* Note: This might not be needed, because smc_pnet_netdev_event() * is also calling smc_pnet_add_base_pnetid() when handling * NETDEV_UP event. */ rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); rtnl_unlock(); } /* clean up list of netdevice pnetids */ static void smc_pnet_destroy_pnetids_list(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *temp_pe; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { list_del(&pe->list); kfree(pe); } write_unlock(&sn->pnetids_ndev.lock); } static int smc_pnet_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(event_dev); u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; switch (event) { case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); return NOTIFY_OK; case NETDEV_DOWN: event_dev = __pnet_find_base_ndev(event_dev); if (!smc_pnetid_by_dev_port(event_dev->dev.parent, event_dev->dev_port, ndev_pnetid)) { /* remove from PNETIDs list */ smc_pnet_remove_pnetid(net, ndev_pnetid); } return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block smc_netdev_notifier = { .notifier_call = smc_pnet_netdev_event }; /* init network namespace */ int smc_pnet_net_init(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnettable *pnettable = &sn->pnettable; struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); smc_pnet_create_pnetids_list(net); return 0; } int __init smc_pnet_init(void) { int rc; rc = genl_register_family(&smc_pnet_nl_family); if (rc) return rc; rc = register_netdevice_notifier(&smc_netdev_notifier); if (rc) genl_unregister_family(&smc_pnet_nl_family); return rc; } /* exit network namespace */ void smc_pnet_net_exit(struct net *net) { /* flush pnet table */ smc_pnet_remove_by_pnetid(net, NULL); smc_pnet_destroy_pnetids_list(net); } void smc_pnet_exit(void) { unregister_netdevice_notifier(&smc_netdev_notifier); genl_unregister_family(&smc_pnet_nl_family); } static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) { int i, nest_lvl; ASSERT_RTNL(); nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; if (list_empty(lower)) break; lower = lower->next; ndev = netdev_lower_get_next(ndev, &lower); } return ndev; } /* Determine one base device for stacked net devices. * If the lower device level contains more than one devices * (for instance with bonding slaves), just the first device * is used to reach a base device. */ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { rtnl_lock(); ndev = __pnet_find_base_ndev(ndev); rtnl_unlock(); return ndev; } static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, u8 *pnetid) { struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_pnetentry *pnetelem; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, struct smc_init_info *ini) { if (!ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, NULL)) { ini->ib_dev = ibdev; ini->ib_port = i; return 0; } if (ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, NULL, &ini->smcrv2)) { ini->smcrv2.ib_dev_v2 = ibdev; ini->smcrv2.ib_port_v2 = i; return 0; } return -ENODEV; } /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, struct smc_ib_device *known_dev, struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev || !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) goto out; } } } out: mutex_unlock(&smc_ib_devices.mutex); } /* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { struct net *net = lgr->net; _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; /* check rdma net namespace */ if (!rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) break; } } } mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. * Searching stops at the first matching active IB device port with vlan_id * configured. * If nothing found, check pnetid table. * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *base_ndev; struct net *net; base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && (!ini->ism_peer_gid[0].gid || !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; } } mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); if (!dst) goto out; if (!dst->dev) goto out_rel; smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); out: return; } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); ini->ism_dev[0] = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); out: return; } /* Lookup and apply a pnet table entry to the given ib device. */ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) { char *ib_name = smcibdev->ibdev->name; struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && tmp_pe->ib_port == ib_port) { smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } /* Lookup and apply a pnet table entry to the given smcd device. */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } |
112 112 904 222 177 142 611 991 938 177 142 1001 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/cmpxchg.h * * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_CMPXCHG_H #define __ASM_CMPXCHG_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <asm/barrier.h> #include <asm/lse.h> /* * We need separate acquire parameters for ll/sc and lse, since the full * barrier case is generated as release+dmb for the former and * acquire+release for the latter. */ #define __XCHG_CASE(w, sfx, name, sz, mb, nop_lse, acq, acq_lse, rel, cl) \ static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr) \ { \ u##sz ret; \ unsigned long tmp; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ /* LL/SC */ \ " prfm pstl1strm, %2\n" \ "1: ld" #acq "xr" #sfx "\t%" #w "0, %2\n" \ " st" #rel "xr" #sfx "\t%w1, %" #w "3, %2\n" \ " cbnz %w1, 1b\n" \ " " #mb, \ /* LSE atomics */ \ " swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n" \ __nops(3) \ " " #nop_lse) \ : "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr) \ : "r" (x) \ : cl); \ \ return ret; \ } __XCHG_CASE(w, b, , 8, , , , , , ) __XCHG_CASE(w, h, , 16, , , , , , ) __XCHG_CASE(w, , , 32, , , , , , ) __XCHG_CASE( , , , 64, , , , , , ) __XCHG_CASE(w, b, acq_, 8, , , a, a, , "memory") __XCHG_CASE(w, h, acq_, 16, , , a, a, , "memory") __XCHG_CASE(w, , acq_, 32, , , a, a, , "memory") __XCHG_CASE( , , acq_, 64, , , a, a, , "memory") __XCHG_CASE(w, b, rel_, 8, , , , , l, "memory") __XCHG_CASE(w, h, rel_, 16, , , , , l, "memory") __XCHG_CASE(w, , rel_, 32, , , , , l, "memory") __XCHG_CASE( , , rel_, 64, , , , , l, "memory") __XCHG_CASE(w, b, mb_, 8, dmb ish, nop, , a, l, "memory") __XCHG_CASE(w, h, mb_, 16, dmb ish, nop, , a, l, "memory") __XCHG_CASE(w, , mb_, 32, dmb ish, nop, , a, l, "memory") __XCHG_CASE( , , mb_, 64, dmb ish, nop, , a, l, "memory") #undef __XCHG_CASE #define __XCHG_GEN(sfx) \ static __always_inline unsigned long \ __arch_xchg##sfx(unsigned long x, volatile void *ptr, int size) \ { \ switch (size) { \ case 1: \ return __xchg_case##sfx##_8(x, ptr); \ case 2: \ return __xchg_case##sfx##_16(x, ptr); \ case 4: \ return __xchg_case##sfx##_32(x, ptr); \ case 8: \ return __xchg_case##sfx##_64(x, ptr); \ default: \ BUILD_BUG(); \ } \ \ unreachable(); \ } __XCHG_GEN() __XCHG_GEN(_acq) __XCHG_GEN(_rel) __XCHG_GEN(_mb) #undef __XCHG_GEN #define __xchg_wrapper(sfx, ptr, x) \ ({ \ __typeof__(*(ptr)) __ret; \ __ret = (__typeof__(*(ptr))) \ __arch_xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \ __ret; \ }) /* xchg */ #define arch_xchg_relaxed(...) __xchg_wrapper( , __VA_ARGS__) #define arch_xchg_acquire(...) __xchg_wrapper(_acq, __VA_ARGS__) #define arch_xchg_release(...) __xchg_wrapper(_rel, __VA_ARGS__) #define arch_xchg(...) __xchg_wrapper( _mb, __VA_ARGS__) #define __CMPXCHG_CASE(name, sz) \ static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr, \ u##sz old, \ u##sz new) \ { \ return __lse_ll_sc_body(_cmpxchg_case_##name##sz, \ ptr, old, new); \ } __CMPXCHG_CASE( , 8) __CMPXCHG_CASE( , 16) __CMPXCHG_CASE( , 32) __CMPXCHG_CASE( , 64) __CMPXCHG_CASE(acq_, 8) __CMPXCHG_CASE(acq_, 16) __CMPXCHG_CASE(acq_, 32) __CMPXCHG_CASE(acq_, 64) __CMPXCHG_CASE(rel_, 8) __CMPXCHG_CASE(rel_, 16) __CMPXCHG_CASE(rel_, 32) __CMPXCHG_CASE(rel_, 64) __CMPXCHG_CASE(mb_, 8) __CMPXCHG_CASE(mb_, 16) __CMPXCHG_CASE(mb_, 32) __CMPXCHG_CASE(mb_, 64) #undef __CMPXCHG_CASE #define __CMPXCHG128(name) \ static inline u128 __cmpxchg128##name(volatile u128 *ptr, \ u128 old, u128 new) \ { \ return __lse_ll_sc_body(_cmpxchg128##name, \ ptr, old, new); \ } __CMPXCHG128( ) __CMPXCHG128(_mb) #undef __CMPXCHG128 #define __CMPXCHG_GEN(sfx) \ static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ unsigned long old, \ unsigned long new, \ int size) \ { \ switch (size) { \ case 1: \ return __cmpxchg_case##sfx##_8(ptr, old, new); \ case 2: \ return __cmpxchg_case##sfx##_16(ptr, old, new); \ case 4: \ return __cmpxchg_case##sfx##_32(ptr, old, new); \ case 8: \ return __cmpxchg_case##sfx##_64(ptr, old, new); \ default: \ BUILD_BUG(); \ } \ \ unreachable(); \ } __CMPXCHG_GEN() __CMPXCHG_GEN(_acq) __CMPXCHG_GEN(_rel) __CMPXCHG_GEN(_mb) #undef __CMPXCHG_GEN #define __cmpxchg_wrapper(sfx, ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ __ret = (__typeof__(*(ptr))) \ __cmpxchg##sfx((ptr), (unsigned long)(o), \ (unsigned long)(n), sizeof(*(ptr))); \ __ret; \ }) /* cmpxchg */ #define arch_cmpxchg_relaxed(...) __cmpxchg_wrapper( , __VA_ARGS__) #define arch_cmpxchg_acquire(...) __cmpxchg_wrapper(_acq, __VA_ARGS__) #define arch_cmpxchg_release(...) __cmpxchg_wrapper(_rel, __VA_ARGS__) #define arch_cmpxchg(...) __cmpxchg_wrapper( _mb, __VA_ARGS__) #define arch_cmpxchg_local arch_cmpxchg_relaxed /* cmpxchg64 */ #define arch_cmpxchg64_relaxed arch_cmpxchg_relaxed #define arch_cmpxchg64_acquire arch_cmpxchg_acquire #define arch_cmpxchg64_release arch_cmpxchg_release #define arch_cmpxchg64 arch_cmpxchg #define arch_cmpxchg64_local arch_cmpxchg_local /* cmpxchg128 */ #define system_has_cmpxchg128() 1 #define arch_cmpxchg128(ptr, o, n) \ ({ \ __cmpxchg128_mb((ptr), (o), (n)); \ }) #define arch_cmpxchg128_local(ptr, o, n) \ ({ \ __cmpxchg128((ptr), (o), (n)); \ }) #define __CMPWAIT_CASE(w, sfx, sz) \ static inline void __cmpwait_case_##sz(volatile void *ptr, \ unsigned long val) \ { \ unsigned long tmp; \ \ asm volatile( \ " sevl\n" \ " wfe\n" \ " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \ " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \ " cbnz %" #w "[tmp], 1f\n" \ " wfe\n" \ "1:" \ : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \ : [val] "r" (val)); \ } __CMPWAIT_CASE(w, b, 8); __CMPWAIT_CASE(w, h, 16); __CMPWAIT_CASE(w, , 32); __CMPWAIT_CASE( , , 64); #undef __CMPWAIT_CASE #define __CMPWAIT_GEN(sfx) \ static __always_inline void __cmpwait##sfx(volatile void *ptr, \ unsigned long val, \ int size) \ { \ switch (size) { \ case 1: \ return __cmpwait_case##sfx##_8(ptr, (u8)val); \ case 2: \ return __cmpwait_case##sfx##_16(ptr, (u16)val); \ case 4: \ return __cmpwait_case##sfx##_32(ptr, val); \ case 8: \ return __cmpwait_case##sfx##_64(ptr, val); \ default: \ BUILD_BUG(); \ } \ \ unreachable(); \ } __CMPWAIT_GEN() #undef __CMPWAIT_GEN #define __cmpwait_relaxed(ptr, val) \ __cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr))) #endif /* __ASM_CMPXCHG_H */ |
247 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2020 - Google LLC * Author: Quentin Perret <qperret@google.com> */ #include <linux/init.h> #include <linux/interval_tree_generic.h> #include <linux/kmemleak.h> #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> #include <linux/memblock.h> #include <linux/mutex.h> #include <asm/kvm_pkvm.h> #include "hyp_constants.h" DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); phys_addr_t hyp_mem_base; phys_addr_t hyp_mem_size; static int __init register_memblock_regions(void) { struct memblock_region *reg; for_each_mem_region(reg) { if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) return -ENOMEM; hyp_memory[*hyp_memblock_nr_ptr] = *reg; (*hyp_memblock_nr_ptr)++; } return 0; } void __init kvm_hyp_reserve(void) { u64 hyp_mem_pages = 0; int ret; if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) return; if (kvm_get_mode() != KVM_MODE_PROTECTED) return; ret = register_memblock_regions(); if (ret) { *hyp_memblock_nr_ptr = 0; kvm_err("Failed to register hyp memblocks: %d\n", ret); return; } hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); hyp_mem_pages += pkvm_selftest_pages(); hyp_mem_pages += hyp_ffa_proxy_pages(); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. */ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), PMD_SIZE); if (!hyp_mem_base) hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); else hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); if (!hyp_mem_base) { kvm_err("Failed to reserve hyp memory\n"); return; } kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); } static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) { if (host_kvm->arch.pkvm.handle) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, host_kvm->arch.pkvm.handle)); } host_kvm->arch.pkvm.handle = 0; free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc); } static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) { size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle; void *hyp_vcpu; int ret; vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vcpu) return -ENOMEM; ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu); if (!ret) vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED); else free_pages_exact(hyp_vcpu, hyp_vcpu_sz); return ret; } /* * Allocates and donates memory for hypervisor VM structs at EL2. * * Allocates space for the VM state, which includes the hyp vm as well as * the hyp vcpus. * * Stores an opaque handler in the kvm struct for future reference. * * Return 0 on success, negative error code on failure. */ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) { size_t pgd_sz, hyp_vm_sz; void *pgd, *hyp_vm; int ret; if (host_kvm->created_vcpus < 1) return -EINVAL; pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr); /* * The PGD pages will be reclaimed using a hyp_memcache which implies * page granularity. So, use alloc_pages_exact() to get individual * refcounts. */ pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); if (!pgd) return -ENOMEM; /* Allocate memory to donate to hyp for vm and vcpu pointers. */ hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, size_mul(sizeof(void *), host_kvm->created_vcpus))); hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vm) { ret = -ENOMEM; goto free_pgd; } /* Donate the VM memory to hyp and let hyp initialize it. */ ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); if (ret < 0) goto free_vm; host_kvm->arch.pkvm.handle = ret; host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); return 0; free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); free_pgd: free_pages_exact(pgd, pgd_sz); return ret; } int pkvm_create_hyp_vm(struct kvm *host_kvm) { int ret = 0; mutex_lock(&host_kvm->arch.config_lock); if (!host_kvm->arch.pkvm.handle) ret = __pkvm_create_hyp_vm(host_kvm); mutex_unlock(&host_kvm->arch.config_lock); return ret; } int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) { int ret = 0; mutex_lock(&vcpu->kvm->arch.config_lock); if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED)) ret = __pkvm_create_hyp_vcpu(vcpu); mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; } void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { mutex_lock(&host_kvm->arch.config_lock); __pkvm_destroy_hyp_vm(host_kvm); mutex_unlock(&host_kvm->arch.config_lock); } int pkvm_init_host_vm(struct kvm *host_kvm) { return 0; } static void __init _kvm_host_prot_finalize(void *arg) { int *err = arg; if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) WRITE_ONCE(*err, -EINVAL); } static int __init pkvm_drop_host_privileges(void) { int ret = 0; /* * Flip the static key upfront as that may no longer be possible * once the host stage 2 is installed. */ static_branch_enable(&kvm_protected_mode_initialized); on_each_cpu(_kvm_host_prot_finalize, &ret, 1); return ret; } static int __init finalize_pkvm(void) { int ret; if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised()) return 0; /* * Exclude HYP sections from kmemleak so that they don't get peeked * at, which would end badly once inaccessible. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start); kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); ret = pkvm_drop_host_privileges(); if (ret) pr_err("Failed to finalize Hyp protection: %d\n", ret); return ret; } device_initcall_sync(finalize_pkvm); static u64 __pkvm_mapping_start(struct pkvm_mapping *m) { return m->gfn * PAGE_SIZE; } static u64 __pkvm_mapping_end(struct pkvm_mapping *m) { return (m->gfn + m->nr_pages) * PAGE_SIZE - 1; } INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last, __pkvm_mapping_start, __pkvm_mapping_end, static, pkvm_mapping); /* * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body of the loop to allow * freeing of __map inline. */ #define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \ __start, __end - 1); \ __tmp && ({ \ __map = __tmp; \ __tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \ true; \ }); \ ) int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { pgt->pkvm_mappings = RB_ROOT_CACHED; pgt->mmu = mmu; return 0; } static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; int ret; if (!handle) return 0; for_each_mapping_in_range_safe(pgt, start, end, mapping) { ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn, mapping->nr_pages); if (WARN_ON(ret)) return ret; pkvm_mapping_remove(mapping, &pgt->pkvm_mappings); kfree(mapping); } return 0; } void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); struct pkvm_mapping *mapping = NULL; struct kvm_hyp_memcache *cache = mc; u64 gfn = addr >> PAGE_SHIFT; u64 pfn = phys >> PAGE_SHIFT; int ret; if (size != PAGE_SIZE && size != PMD_SIZE) return -EINVAL; lockdep_assert_held_write(&kvm->mmu_lock); /* * Calling stage2_map() on top of existing mappings is either happening because of a race * with another vCPU, or because we're changing between page and block mappings. As per * user_mem_abort(), same-size permission faults are handled in the relax_perms() path. */ mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1); if (mapping) { if (size == (mapping->nr_pages * PAGE_SIZE)) return -EAGAIN; /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */ ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); if (ret) return ret; mapping = NULL; } ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot); if (WARN_ON(ret)) return ret; swap(mapping, cache->mapping); mapping->gfn = gfn; mapping->pfn = pfn; mapping->nr_pages = size / PAGE_SIZE; pkvm_mapping_insert(mapping, &pgt->pkvm_mappings); return ret; } int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock); return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); } int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; int ret = 0; lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn, mapping->nr_pages); if (WARN_ON(ret)) break; } return ret; } int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); struct pkvm_mapping *mapping; lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE * mapping->nr_pages); return 0; } bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; bool young = false; lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, mapping->nr_pages, mkold); return young; } int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); } void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); } void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { WARN_ON_ONCE(1); } kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { WARN_ON_ONCE(1); return NULL; } int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc) { WARN_ON_ONCE(1); return -EINVAL; } |
270 270 9 319 156 61 205 205 205 321 320 3 320 319 320 1 309 19 225 131 318 1 91 123 1 118 2 24 134 132 2 3 195 1 1 72 2 90 1 1 4 217 116 24 94 65 90 203 321 69 12 69 154 1 160 153 319 318 316 318 152 254 319 319 92 206 77 132 1 318 316 97 97 270 274 517 519 13 91 301 1 302 272 23 34 189 62 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * * Written by obz. * * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/memfd.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> #include "internal.h" #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { unsigned long mapped_addr; mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (current->brk_randomized) min_brk = mm->start_brk; else min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif if (brk < min_brk) goto out; /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vmi_align_munmap() will drop the lock on success, so * update it before calling do_vma_munmap(). */ mm->brk = brk; if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, /* unlock = */ true)) goto out; goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) goto out; /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /** * do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified * in @file and page offset into the file via @pgoff. * * This function does not perform security checks on the file and assumes, if * @uf is non-NULL, the caller has provided a list head to track unmap events * for userfaultfd @uf. * * It also simply indicates whether memory population is required by setting * @populate, which must be non-NULL, expecting the caller to actually perform * this task itself if appropriate. * * This function will invoke architecture-specific (and if provided and * relevant, file system-specific) logic to determine the most appropriate * unmapped area in which to place the mapping if not MAP_FIXED. * * Callers which require userland mmap() behaviour should invoke vm_mmap(), * which is also exported for module use. * * Those which require this behaviour less security checks, userfaultfd and * populate behaviour, and who handle the mmap write lock themselves, should * call this function. * * Note that the returned address may reside within a merged VMA if an * appropriate merge were to take place, so it doesn't necessarily specify the * start of a VMA, rather only the start of a valid mapped range of length * @len bytes, rounded down to the nearest page size. * * The caller must write-lock current->mm->mmap_lock. * * @file: An optional struct file pointer describing the file which is to be * mapped, if a file-backed mapping. * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the * address at which to perform this mapping. See mmap (2) for details. Must be * page-aligned. * @len: The length of the mapping. Will be page-aligned and must be at least 1 * page in size. * @prot: Protection bits describing access required to the mapping. See mmap * (2) for details. * @flags: Flags specifying how the mapping should be performed, see mmap (2) * for details. * @vm_flags: VMA flags which should be set by default, or 0 otherwise. * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. * @populate: A pointer to a value which will be set to 0 if no population of * the range is required, or the number of bytes to populate if it is. Must be * non-NULL. See mmap (2) for details as to under what circumstances population * of the range occurs. * @uf: An optional pointer to a list head to track userfaultfd unmap events * should unmapping events arise. If provided, it is up to the caller to manage * this. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; mmap_assert_write_locked(mm); if (!len) return -EINVAL; /* * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) return -ENOMEM; /* * addr is returned from get_unmapped_area, * There are two cases: * 1> MAP_FIXED == false * unallocated memory, no need to check sealing. * 1> MAP_FIXED == true * sealing is checked inside mmap_region when * do_vmi_munmap is called. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) pkey = 0; } /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK; if (file->f_op->fop_flags & FOP_MMAP_SYNC) flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!file->f_op->mmap) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } /* * Check to see if we are violating any seals and update VMA * flags if necessary to avoid future seal violations. */ err = memfd_check_seals_mmap(file, &vm_flags); if (err) return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) return -ENOTSUPP; /* * A locked or stack area makes no sense to be droppable. * * Also, since droppable pages can just go away at any time * it makes no sense to copy them on fork or dump them. * * And don't attempt to combine with hugetlb for now. */ if (flags & (MAP_LOCKED | MAP_HUGETLB)) return -EINVAL; if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) return -EINVAL; vm_flags |= VM_DROPPABLE; /* * If the pages can be dropped, then it doesn't make * sense to reserve them. */ vm_flags |= VM_NORESERVE; /* * Likewise, they're volatile enough that they * shouldn't survive forks or coredumps. */ vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. */ if (flags & MAP_NORESERVE) { /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; } } else if (flags & MAP_HUGETLB) { struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); return retval; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. */ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) { if (vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else addr = unmapped_area(info); trace_vm_unmapped_area(addr, info); return addr; } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. */ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0); return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, addr); vma = vma_iter_load(&vmi); *pprev = vma_prev(&vmi); if (!vma) vma = vma_next(&vmi); return vma; } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; static int __init cmdline_parse_stack_guard_gap(char *p) { unsigned long val; char *endptr; val = simple_strtoul(p, &endptr, 10); if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_downwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) return vma; start = vma->vm_start; if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) #else #define vma_expand_up(vma,addr) (-EFAULT) #define vma_expand_down(vma, addr) expand_downwards(vma, addr) #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. */ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; mmap_read_unlock(mm); if (mmap_write_lock_killable(mm)) return NULL; vma = find_vma_prev(mm, addr, &prev); if (vma && vma->vm_start <= addr) goto success; if (prev && !vma_expand_up(prev, addr)) { vma = prev; goto success; } if (vma && !vma_expand_down(vma, addr)) goto success; mmap_write_unlock(mm); return NULL; success: mmap_write_downgrade(mm); return vma; } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head * * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { VMA_ITERATOR(vmi, mm, start); return do_vmi_munmap(&vmi, mm, start, len, uf, false); } int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); return __vm_munmap(addr, len, true); } /* * Emulation of deprecated remap_file_pages() syscall. */ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) return ret; start = start & PAGE_MASK; size = size & PAGE_MASK; if (start + size <= start) return ret; /* Does pgoff wrap? */ if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; if (mmap_read_lock_killable(mm)) return -EINTR; /* * Look up VMA under read lock first so we can perform the security * without holding locks (which can be problematic). We reacquire a * write lock later and check nothing changed underneath us. */ vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) { mmap_read_unlock(mm); return -EINVAL; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; /* Save vm_flags used to calculate prot and flags, and recheck later. */ vm_flags = vma->vm_flags; file = get_file(vma->vm_file); mmap_read_unlock(mm); /* Call outside mmap_lock to be consistent with other callers. */ ret = security_mmap_file(file, prot, flags); if (ret) { fput(file); return ret; } ret = -EINVAL; /* OK security check passed, take write lock + let it rip. */ if (mmap_write_lock_killable(mm)) { fput(file); return -EINTR; } vma = vma_lookup(mm, start); if (!vma) goto out; /* Make sure things didn't change under us. */ if (vma->vm_flags != vm_flags) goto out; if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { VMA_ITERATOR(vmi, mm, vma->vm_end); struct vm_area_struct *next, *prev = vma; for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) goto out; if (next->vm_flags != vma->vm_flags) goto out; if (start + size <= next->vm_end) break; prev = next; } if (!next) goto out; } ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); out: mmap_write_unlock(mm); fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) ret = 0; return ret; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) return -ENOMEM; if (!len) return 0; /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; ret = check_brk_limits(addr, len); if (ret) goto limits_failed; ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; vma = vma_prev(&vmi); ret = do_brk_flags(&vmi, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; munmap_failed: limits_failed: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(vm_brk_flags); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); mmap_read_lock(mm); arch_exit_mmap(mm); vma = vma_next(&vmi); if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); mmap_write_lock(mm); goto destroy; } flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, with preemption * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ vma_iter_set(&vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma_mark_detached(vma); remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma->vm_flags & VM_ACCOUNT) vm_unacct_memory(charged); return -ENOMEM; } return 0; } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA), ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); if (!ignore_rlimit_data) return false; } return true; } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; else if (is_stack_mapping(flags)) mm->stack_vm += npages; else if (is_data_mapping(flags)) mm->data_vm += npages; } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Close hook, called for unmap() and on the old vma for mremap(). * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { const struct vm_special_mapping *sm = vma->vm_private_data; if (sm->close) sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) { return ((struct vm_special_mapping *)vma->vm_private_data)->name; } static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; if (sm->mremap) return sm->mremap(sm, new_vma); return 0; } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting special mappings - kernel has expectations over * the number of pages in mapping. Together with VM_DONTEXPAND * the size of vma should stay the same over the special mapping's * lifetime. */ return -EINVAL; } static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, .may_split = special_mapping_split, }; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { struct page *page = *pages; get_page(page); vmf->page = page; return 0; } return VM_FAULT_SIGBUS; } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) goto out; vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); return vma; out: vm_area_free(vma); return ERR_PTR(ret); } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && vma->vm_ops == &special_mapping_vmops; } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); } #ifdef CONFIG_SYSCTL #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif static const struct ctl_table mmap_table[] = { { .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS { .procname = "mmap_rnd_bits", .data = &mmap_rnd_bits, .maxlen = sizeof(mmap_rnd_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_bits_min, .extra2 = (void *)&mmap_rnd_bits_max, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS { .procname = "mmap_rnd_compat_bits", .data = &mmap_rnd_compat_bits, .maxlen = sizeof(mmap_rnd_compat_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif }; #endif /* CONFIG_SYSCTL */ /* * initialise the percpu counter for VM */ void __init mmap_init(void) { int ret; ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif } /* * Initialise sysctl_user_reserve_kbytes. * * This is intended to prevent a user from starting a single memory hogging * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER * mode. * * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ static int init_user_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. * * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin * to log in and kill a memory hogging process. * * Systems with more than 256MB will reserve 8MB, enough to recover * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ static int init_admin_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. * * The default user reserve max is 128MB, and the default max for the * admin reserve is 8MB. These are usually, but not always, enough to * enable recovery from a memory hogging process using login/sshd, a shell, * and tools like top. It may make sense to increase or even disable the * reserve depending on the existence of swap or variations in the recovery * tools. So, the admin may have changed them. * * If memory is added and the reserves have been eliminated or increased above * the default max, then we'll trust the admin. * * If memory is removed and there isn't enough free memory, then we * need to reset the reserves. * * Otherwise keep the reserve set by the admin. */ static int reserve_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { unsigned long tmp, free_kbytes; switch (action) { case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; case MEM_OFFLINE: free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); pr_info("vm.user_reserve_kbytes reset to %lu\n", sysctl_user_reserve_kbytes); } if (sysctl_admin_reserve_kbytes > free_kbytes) { init_admin_reserve(); pr_info("vm.admin_reserve_kbytes reset to %lu\n", sysctl_admin_reserve_kbytes); } break; default: break; } return NOTIFY_OK; } static int __meminit init_reserve_notifier(void) { if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } subsys_initcall(init_reserve_notifier); /* * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between * this VMA and its relocated range, which will now reside at [vma->vm_start - * shift, vma->vm_end - shift). * * This function is almost certainly NOT what you want for anything other than * early executable temporary stack relocation. */ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) { /* * The process proceeds as follows: * * 1) Use shift to calculate the new vma endpoints. * 2) Extend vma to cover both the old and new ranges. This ensures the * arguments passed to subsequent functions are consistent. * 3) Move vma's page tables to the new range. * 4) Free up any cleared pgd range. * 5) Shrink the vma to cover only the new range. */ struct mm_struct *mm = vma->vm_mm; unsigned long old_start = vma->vm_start; unsigned long old_end = vma->vm_end; unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); BUG_ON(new_start > new_end); /* * ensure there are no vmas between where we want to go * and where we are */ if (vma != vma_next(&vmi)) return -EFAULT; vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ vmg.middle = vma; if (vma_expand(&vmg)) return -ENOMEM; /* * move the page tables downwards, on failure we rely on * process cleanup to remove whatever mess we made. */ pmc.for_stack = true; if (length != move_page_tables(&pmc)) return -ENOMEM; tlb_gather_mmu(&tlb, mm); next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch * the address space in [new_end, old_start) some architectures * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); vma_prev(&vmi); /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } #ifdef CONFIG_MMU /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. * * This function is intended only for obtaining an argument page from an ELF * image, and is almost certainly NOT what you want to use for any other * purpose. * * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the * VMA referenced must not be linked in any user-visible tree, i.e. it must be a * new VMA being mapped. * * The function assumes that addr is either contained within the VMA or below * it, and makes no attempt to validate this value beyond that. * * Returns true if the read lock was obtained and a stack was perhaps expanded, * false if the stack expansion failed. * * On stack expansion the function temporarily acquires an mmap write lock * before downgrading it. */ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *new_vma, unsigned long addr, bool write) { if (!write || addr >= new_vma->vm_start) { mmap_read_lock(mm); return true; } if (!(new_vma->vm_flags & VM_GROWSDOWN)) return false; mmap_write_lock(mm); if (expand_downwards(new_vma, addr)) { mmap_write_unlock(mm); return false; } mmap_write_downgrade(mm); return true; } #else bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write) { return false; } #endif |
3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2"); |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 | /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); } |
76 76 235 233 235 229 21 235 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); if (group_dead) tty_kref_put(tty); } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); pidfs_exit(p); cgroup_release(p); thread_pid = get_pid(p->thread_pid); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(&post, p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* for pidfs_exit() and do_notify_parent() */ if (leader->signal->flags & SIGNAL_GROUP_EXIT) leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists * and lock_task_sighand(p) can't succeed. Nobody else can touch * ->pending or, if group dead, signal->shared_pending. We can call * flush_sigqueue() lockless. */ flush_sigqueue(&p->pending); if (thread_group_leader(p)) flush_sigqueue(&p->signal->shared_pending); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk) { struct core_state *core_state; /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ spin_lock_irq(&tsk->sighand->siglock); tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); if (core_state) { struct core_thread self; self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } spin_unlock_irq(&sighand->siglock); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort); |
161 246 246 189 166 246 166 190 190 246 8 8 8 8 24 24 13 22 24 22 13 13 24 24 209 209 208 209 209 208 209 209 209 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 | // SPDX-License-Identifier: GPL-2.0-only /* * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> #include <asm/sections.h> /* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) { mutex_lock(&jump_label_mutex); } void jump_label_unlock(void) { mutex_unlock(&jump_label_mutex); } static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; const struct jump_entry *jeb = b; /* * Entrires are sorted by key. */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; /* * In the batching mode, entries should also be sorted by the code * inside the already sorted list of entries, enabling a bsearch in * the vector. */ if (jump_entry_code(jea) < jump_entry_code(jeb)) return -1; if (jump_entry_code(jea) > jump_entry_code(jeb)) return 1; return 0; } static void jump_label_swap(void *a, void *b, int size) { long delta = (unsigned long)a - (unsigned long)b; struct jump_entry *jea = a; struct jump_entry *jeb = b; struct jump_entry tmp = *jea; jea->code = jeb->code - delta; jea->target = jeb->target - delta; jea->key = jeb->key - delta; jeb->code = tmp.code + delta; jeb->target = tmp.target + delta; jeb->key = tmp.key + delta; } static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; void *swapfn = NULL; if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); /* * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and its problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { /* * -1 means the first static_key_slow_inc() is in progress. * static_key_enabled() must return true, so return 1 here. */ int n = atomic_read(&key->enabled); return n >= 0 ? n : 1; } EXPORT_SYMBOL_GPL(static_key_count); /* * static_key_fast_inc_not_disabled - adds a user for a static key * @key: static key that must be already enabled * * The caller must make sure that the static key can't get disabled while * in this function. It doesn't patch jump labels, only adds a user to * an already enabled static key. * * Returns true if the increment was done. Unlike refcount_t the ref counter * is not saturated, but will fail to increment on overflow. */ bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends * static_key_slow_inc/dec() down the slow path, and it is non-zero * so it counts as "enabled" in jump_label_update(). * * The INT_MAX overflow condition is either used by the networking * code to reset or detected in the slow path of * static_key_slow_inc_cpuslocked(). */ v = atomic_read(&key->enabled); do { if (v <= 0 || v == INT_MAX) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); bool static_key_slow_inc_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc/dec() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. */ if (static_key_fast_inc_not_disabled(key)) return true; guard(mutex)(&jump_label_mutex); /* Try to mark it as 'enabling in progress. */ if (!atomic_cmpxchg(&key->enabled, 0, -1)) { jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { /* * While holding the mutex this should never observe * anything else than a value >= 1 and succeed */ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) return false; } return true; } bool static_key_slow_inc(struct static_key *key) { bool ret; cpus_read_lock(); ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * See static_key_slow_inc(). */ atomic_set_release(&key->enabled, 1); } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); void static_key_enable(struct static_key *key) { cpus_read_lock(); static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); void static_key_disable(struct static_key *key) { cpus_read_lock(); static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_dec_not_one(struct static_key *key) { int v; /* * Go into the slow path if key::enabled is less than or equal than * one. One is valid to shut down the key, anything less than one * is an imbalance, which is handled at the call site. * * That includes the special case of '-1' which is set in * static_key_slow_inc_cpuslocked(), but that's harmless as it is * fully serialized in the slow path below. By the time this task * acquires the jump label lock the value is back to one and the * retry under the lock must succeed. */ v = atomic_read(&key->enabled); do { /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); /* * Warn about underflow, and lie about success in an attempt to * not make things worse. */ if (WARN_ON_ONCE(v == 0)) return true; if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); return true; } static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); int val; if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); val = atomic_read(&key->enabled); /* * It should be impossible to observe -1 with jump_label_mutex held, * see static_key_slow_inc_cpuslocked(). */ if (WARN_ON_ONCE(val == -1)) return; /* * Cannot already be 0, something went sideways. */ if (WARN_ON_ONCE(val == 0)) return; if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); } static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); flush_delayed_work(work); } EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; } static int __jump_label_text_reserved(struct jump_entry *iter_start, struct jump_entry *iter_stop, void *start, void *end, bool init) { struct jump_entry *iter; iter = iter_start; while (iter < iter_stop) { if (init || !jump_entry_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifndef arch_jump_label_transform_static static void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { /* nothing to do on most architectures */ } #endif static inline struct jump_entry *static_key_entries(struct static_key *key) { WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { return key->type & JUMP_TYPE_TRUE; } static inline bool static_key_linked(struct static_key *key) { return key->type & JUMP_TYPE_LINKED; } static inline void static_key_clear_linked(struct static_key *key) { key->type &= ~JUMP_TYPE_LINKED; } static inline void static_key_set_linked(struct static_key *key) { key->type |= JUMP_TYPE_LINKED; } /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in * turn point to 'struct jump_entry' tables. * * The two lower bits of the pointer are used to keep track of which pointer * type is in use and to store the initial branch direction, we use an access * function which preserves these bits. */ static void static_key_set_entries(struct static_key *key, struct jump_entry *entries) { unsigned long type; WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->entries = entries; key->type |= type; } static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; } static bool jump_label_can_update(struct jump_entry *entry, bool init) { /* * Cannot update code that was in an init text area. */ if (!init && jump_entry_is_init(entry)) return false; if (!kernel_text_address(jump_entry_code(entry))) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); return false; } return true; } #ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (jump_label_can_update(entry, init)) arch_jump_label_transform(entry, jump_label_type(entry)); } } #else static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (!jump_label_can_update(entry, init)) continue; if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { /* * Queue is full: Apply the current queue and try again. */ arch_jump_label_transform_apply(); BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } arch_jump_label_transform_apply(); } #endif void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct static_key *key = NULL; struct jump_entry *iter; /* * Since we are initializing the static_key.enabled field with * with the 'raw' int values (to avoid pulling in atomic.h) in * jump_label.h, let's make sure that is safe. There are only two * cases to check since we initialize to 0 or 1. */ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); if (static_key_initialized) return; cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); in_init = init_section_contains((void *)jump_entry_code(iter), 1); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); cpus_read_unlock(); } static inline bool static_key_sealed(struct static_key *key) { return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); } static inline void static_key_seal(struct static_key *key) { unsigned long type = key->type & JUMP_TYPE_TRUE; key->type = JUMP_TYPE_LINKED | type; } void jump_label_init_ro(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; if (WARN_ON_ONCE(!static_key_initialized)) return; cpus_read_lock(); jump_label_lock(); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk = jump_entry_key(iter); if (!is_kernel_ro_after_init((unsigned long)iterk)) continue; if (static_key_sealed(iterk)) continue; static_key_seal(iterk); } jump_label_unlock(); cpus_read_unlock(); } #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; } struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; static inline struct static_key_mod *static_key_mod(struct static_key *key) { WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } /*** * key->type and key->next are the same via union. * This sets key->next and preserves the type bits. * * See additional comments above static_key_set_entries(). */ static void static_key_set_mod(struct static_key *key, struct static_key_mod *mod) { unsigned long type; WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->next = mod; key->type |= type; } static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; scoped_guard(rcu) { mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; } if (!mod) return 0; ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; for (mod = static_key_mod(key); mod; mod = mod->next) { struct jump_entry *stop; struct module *m; /* * NULL if the static_key is defined in a module * that does not use it */ if (!mod->entries) continue; m = mod->mod; if (!m) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; __jump_label_update(key, mod->entries, stop, m && m->state == MODULE_STATE_COMING); } } static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) return 0; jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; in_init = within_module_init(jump_entry_code(iter), mod); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } /* * If the key was sealed at init, then there's no need to keep a * reference to its module entries - just patch them now and be * done with it. */ if (static_key_sealed(key)) goto do_poke; jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; if (!static_key_linked(key)) { jlm2 = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm2) { kfree(jlm); return -ENOMEM; } scoped_guard(rcu) jlm2->mod = __module_address((unsigned long)key); jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); static_key_set_linked(key); } jlm->mod = mod; jlm->entries = iter; jlm->next = static_key_mod(key); static_key_set_mod(key, jlm); static_key_set_linked(key); /* Only update if we've changed from our initial state */ do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } return 0; } static void jump_label_del_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue; key = jump_entry_key(iter); if (within_module((unsigned long)key, mod)) continue; /* No @jlm allocated because key was sealed at init. */ if (static_key_sealed(key)) continue; /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; prev = &key->next; jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } /* No memory during module load */ if (WARN_ON(!jlm)) continue; if (prev == &key->next) static_key_set_mod(key, jlm->next); else *prev = jlm->next; kfree(jlm); jlm = static_key_mod(key); /* if only one etry is left, fold it back into the static_key */ if (jlm->next == NULL) { static_key_set_entries(key, jlm->entries); static_key_clear_linked(key); kfree(jlm); } } } static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); jump_label_lock(); switch (val) { case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; case MODULE_STATE_GOING: jump_label_del_module(mod); break; } jump_label_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ /*** * jump_label_text_reserved - check if addr range is reserved * @start: start text addr * @end: end text addr * * checks if the text addr located between @start and @end * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ int jump_label_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __jump_label_text_reserved(__start___jump_table, __stop___jump_table, start, end, init); if (ret) return ret; #ifdef CONFIG_MODULES ret = __jump_label_mod_text_reserved(start, end); #endif return ret; } static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; if (static_key_linked(key)) { __jump_label_mod_update(key); return; } scoped_guard(rcu) { mod = __module_address((unsigned long)key); if (mod) { stop = mod->jump_entries + mod->num_jump_entries; init = mod->state == MODULE_STATE_COMING; } } #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST static DEFINE_STATIC_KEY_TRUE(sk_true); static DEFINE_STATIC_KEY_FALSE(sk_false); static __init int jump_label_test(void) { int i; for (i = 0; i < 2; i++) { WARN_ON(static_key_enabled(&sk_true.key) != true); WARN_ON(static_key_enabled(&sk_false.key) != false); WARN_ON(!static_branch_likely(&sk_true)); WARN_ON(!static_branch_unlikely(&sk_true)); WARN_ON(static_branch_likely(&sk_false)); WARN_ON(static_branch_unlikely(&sk_false)); static_branch_disable(&sk_true); static_branch_enable(&sk_false); WARN_ON(static_key_enabled(&sk_true.key) == true); WARN_ON(static_key_enabled(&sk_false.key) == false); WARN_ON(static_branch_likely(&sk_true)); WARN_ON(static_branch_unlikely(&sk_true)); WARN_ON(!static_branch_likely(&sk_false)); WARN_ON(!static_branch_unlikely(&sk_false)); static_branch_enable(&sk_true); static_branch_disable(&sk_false); } return 0; } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ |
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 208 209 208 209 208 209 208 209 209 209 208 208 208 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/csd.h> #undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other * CPUs might arrive late, either due to hardware latencies or * because this CPU disabled interrupts (inside stop-machine) * before the IPIs were sent. So flush out any pending callbacks * explicitly (without waiting for the IPIs to arrive), to * ensure that the outgoing CPU doesn't go offline with work * still pending. */ __flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } static __always_inline void send_call_function_single_ipi(int cpu) { if (call_function_single_prep_ipi(cpu)) { trace_ipi_send_cpu(cpu, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_single_ipi(cpu); } } static __always_inline void send_call_function_ipi_mask(struct cpumask *mask) { trace_ipi_send_cpumask(mask, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_ipi_mask(mask); } static __always_inline void csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); trace_csd_function_exit(func, csd); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); /* * Parse the csdlock_debug= kernel boot parameter. * * If you need to restore the old "ext" value that once provided * additional debugging information, reapply the following commits: * * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") */ static int __init csdlock_debug(char *str) { int ret; unsigned int val = 0; ret = get_option(&str, &val); if (ret) { if (val) static_branch_enable(&csdlock_debug_enabled); else static_branch_disable(&csdlock_debug_enabled); } return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0644); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ module_param(panic_on_ipistall, int, 0644); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. */ } static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? * * Returns @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. */ bool csd_lock_is_stuck(void) { return !!atomic_read(&n_csd_lock_stuck); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); atomic_dec(&n_csd_lock_stuck); return true; } ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || csd_lock_timeout_ns == 0)) return false; if (ts0 > ts2) { /* Our own sched_clock went backward; don't blame another CPU. */ ts_delta = ts0 - ts2; pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); *ts1 = ts2; return false; } firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering * on underflows when the TSC is out of sync between sockets. */ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } if (firsttime) dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ static void __csd_lock_wait(call_single_data_t *csd) { unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else static void csd_lock_record(call_single_data_t *csd) { } static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { /* * We have to check the type of the CSD before queueing it, because * once queued it can have its flags cleared by * flush_smp_call_function_queue() * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; csd = container_of(node, call_single_data_t, node.llist); func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* * The list addition should be visible to the target CPU when it pops * the head of the list to pull the entry off it in the IPI handler * because of normal cache coherency rules implied by the underlying * llist ops. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { __flush_smp_call_function_queue(true); } /** * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; atomic_t *tbt; lockdep_assert_irqs_disabled(); /* Allow waiters to send backtrace NMI from here onwards */ tbt = this_cpu_ptr(&trigger_backtrace); atomic_set_release(tbt, 1); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) return; /* * Second; run all !SYNC callbacks. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. */ if (entry) { csd = llist_entry(entry, typeof(*csd), node.llist); csd_do_func(sched_ttwu_pending, entry, csd); } } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * from task context (idle, migration thread) * * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to * handle queued SMP function calls before scheduling. * * The migration thread has to ensure that an eventually pending wakeup has * been handled before it migrates a task. */ void flush_smp_call_function_queue(void) { unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * prevent preemption and reschedule on another processor, * as well as CPU removal */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. * * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). * * Selection preference: * 1) current cpu if in @mask * 2) any cpu of current node if in @mask * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (cpumask_test_cpu(cpu, mask)) goto call; /* Try for same node. */ nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) goto call; } /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ cpu = cpumask_any_and(mask, cpu_online_mask); call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; bool run_local = false; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && (!cond_func || cond_func(this_cpu, info))) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); if (cpu < nr_cpu_ids) run_remote = true; if (run_remote) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); if (cond_func && !cond_func(cpu, info)) { __cpumask_clear_cpu(cpu, cfd->cpumask); continue; } csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } } /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) send_call_function_ipi_mask(cfd->cpumask_ipi); } if (run_local) { unsigned long flags; local_irq_save(flags); csd_do_func(func, info, NULL); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } } /** * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. */ void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) set_nr_cpu_ids(nr_cpus); return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); #if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer. It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. */ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * wake_up_all_idle_cpus try to break all cpus which is in idle state even * including idle polling cpus, for non-idle cpus, we will do nothing * for them. */ void wake_up_all_idle_cpus(void) { int cpu; for_each_possible_cpu(cpu) { preempt_disable(); if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); preempt_enable(); } } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu); |
1518 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H #include <linux/jump_label.h> #include <linux/thread_info.h> #define PREEMPT_NEED_RESCHED BIT(32) #define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) static inline int preempt_count(void) { return READ_ONCE(current_thread_info()->preempt.count); } static inline void preempt_count_set(u64 pc) { /* Preserve existing value of PREEMPT_NEED_RESCHED */ WRITE_ONCE(current_thread_info()->preempt.count, pc); } #define init_task_preempt_count(p) do { \ task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) static inline void set_preempt_need_resched(void) { current_thread_info()->preempt.need_resched = 0; } static inline void clear_preempt_need_resched(void) { current_thread_info()->preempt.need_resched = 1; } static inline bool test_preempt_need_resched(void) { return !current_thread_info()->preempt.need_resched; } static inline void __preempt_count_add(int val) { u32 pc = READ_ONCE(current_thread_info()->preempt.count); pc += val; WRITE_ONCE(current_thread_info()->preempt.count, pc); } static inline void __preempt_count_sub(int val) { u32 pc = READ_ONCE(current_thread_info()->preempt.count); pc -= val; WRITE_ONCE(current_thread_info()->preempt.count, pc); } static inline bool __preempt_count_dec_and_test(void) { struct thread_info *ti = current_thread_info(); u64 pc = READ_ONCE(ti->preempt_count); /* Update only the count field, leaving need_resched unchanged */ WRITE_ONCE(ti->preempt.count, --pc); /* * If we wrote back all zeroes, then we're preemptible and in * need of a reschedule. Otherwise, we need to reload the * preempt_count in case the need_resched flag was cleared by an * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ return !pc || !READ_ONCE(ti->preempt_count); } static inline bool should_resched(int preempt_offset) { u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; } #ifdef CONFIG_PREEMPTION void preempt_schedule(void); void preempt_schedule_notrace(void); #ifdef CONFIG_PREEMPT_DYNAMIC DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() #else /* CONFIG_PREEMPT_DYNAMIC */ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() #endif /* CONFIG_PREEMPT_DYNAMIC */ #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_RTNETLINK_H #define __LINUX_RTNETLINK_H #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/wait.h> #include <linux/refcount.h> #include <uapi/linux/rtnetlink.h> extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo) { return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo); } extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern int rtnl_lock_interruptible(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; #define ASSERT_RTNL() \ WARN_ONCE(!rtnl_is_locked(), \ "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); #else static inline bool lockdep_rtnl_is_held(void) { return true; } #endif /* #ifdef CONFIG_PROVE_LOCKING */ /** * rcu_dereference_rtnl - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() */ #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * * Return: the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) /** * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning * its old value * @rp: RCU pointer, whose value is returned * @p: regular pointer * * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated * pointer. The old value of @rp is returned, and @rp is set to @p */ #define rcu_replace_pointer_rtnl(rp, p) \ rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) #ifdef CONFIG_DEBUG_NET_SMALL_RTNL void __rtnl_net_lock(struct net *net); void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); int rtnl_net_trylock(struct net *net); int rtnl_net_lock_killable(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); #define ASSERT_RTNL_NET(net) \ WARN_ONCE(!rtnl_net_is_locked(net), \ "RTNL_NET: assertion failed at %s (%d)\n", \ __FILE__, __LINE__) bool lockdep_rtnl_net_is_held(struct net *net); #define rcu_dereference_rtnl_net(net, p) \ rcu_dereference_check(p, lockdep_rtnl_net_is_held(net)) #define rtnl_net_dereference(net, p) \ rcu_dereference_protected(p, lockdep_rtnl_net_is_held(net)) #define rcu_replace_pointer_rtnl_net(net, rp, p) \ rcu_replace_pointer(rp, p, lockdep_rtnl_net_is_held(net)) #else static inline void __rtnl_net_lock(struct net *net) {} static inline void __rtnl_net_unlock(struct net *net) {} static inline void rtnl_net_lock(struct net *net) { rtnl_lock(); } static inline void rtnl_net_unlock(struct net *net) { rtnl_unlock(); } static inline int rtnl_net_trylock(struct net *net) { return rtnl_trylock(); } static inline int rtnl_net_lock_killable(struct net *net) { return rtnl_lock_killable(); } static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); } #define rcu_dereference_rtnl_net(net, p) \ rcu_dereference_rtnl(p) #define rtnl_net_dereference(net, p) \ rtnl_dereference(p) #define rcu_replace_pointer_rtnl_net(net, rp, p) \ rcu_replace_pointer_rtnl(rp, p) #endif static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); } static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev) { return rcu_dereference(dev->ingress_queue); } struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ struct ndo_fdb_dump_context { unsigned long ifindex; unsigned long fdb_idx; }; extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); extern int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags); extern int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); extern void rtnl_offload_xstats_notify(struct net_device *dev); static inline int rtnl_has_listeners(const struct net *net, u32 group) { struct sock *rtnl = net->rtnl; return netlink_has_listeners(rtnl, group); } /** * rtnl_notify_needed - check if notification is needed * @net: Pointer to the net namespace * @nlflags: netlink ingress message flags * @group: rtnl group * * Based on the ingress message flags and rtnl group, returns true * if a notification is needed, false otherwise. */ static inline bool rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) { return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); } void netif_set_operstate(struct net_device *dev, int newstate); #endif /* __LINUX_RTNETLINK_H */ |
1257 203 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 | /* SPDX-License-Identifier: GPL-2.0 */ /* * security/tomoyo/common.h * * Header file for TOMOYO. * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #ifndef _SECURITY_TOMOYO_COMMON_H #define _SECURITY_TOMOYO_COMMON_H #define pr_fmt(fmt) fmt #include <linux/ctype.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/list.h> #include <linux/cred.h> #include <linux/poll.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/un.h> #include <linux/lsm_hooks.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> /********** Constants definitions. **********/ /* * TOMOYO uses this hash only when appending a string into the string * table. Frequency of appending strings is very low. So we don't need * large (e.g. 64k) hash size. 256 will be sufficient. */ #define TOMOYO_HASH_BITS 8 #define TOMOYO_MAX_HASH (1u<<TOMOYO_HASH_BITS) /* * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET. * Therefore, we don't need SOCK_MAX. */ #define TOMOYO_SOCK_MAX 6 #define TOMOYO_EXEC_TMPSIZE 4096 /* Garbage collector is trying to kfree() this element. */ #define TOMOYO_GC_IN_PROGRESS -1 /* Profile number is an integer between 0 and 255. */ #define TOMOYO_MAX_PROFILES 256 /* Group number is an integer between 0 and 255. */ #define TOMOYO_MAX_ACL_GROUPS 256 /* Index numbers for "struct tomoyo_condition". */ enum tomoyo_conditions_index { TOMOYO_TASK_UID, /* current_uid() */ TOMOYO_TASK_EUID, /* current_euid() */ TOMOYO_TASK_SUID, /* current_suid() */ TOMOYO_TASK_FSUID, /* current_fsuid() */ TOMOYO_TASK_GID, /* current_gid() */ TOMOYO_TASK_EGID, /* current_egid() */ TOMOYO_TASK_SGID, /* current_sgid() */ TOMOYO_TASK_FSGID, /* current_fsgid() */ TOMOYO_TASK_PID, /* sys_getpid() */ TOMOYO_TASK_PPID, /* sys_getppid() */ TOMOYO_EXEC_ARGC, /* "struct linux_binprm *"->argc */ TOMOYO_EXEC_ENVC, /* "struct linux_binprm *"->envc */ TOMOYO_TYPE_IS_SOCKET, /* S_IFSOCK */ TOMOYO_TYPE_IS_SYMLINK, /* S_IFLNK */ TOMOYO_TYPE_IS_FILE, /* S_IFREG */ TOMOYO_TYPE_IS_BLOCK_DEV, /* S_IFBLK */ TOMOYO_TYPE_IS_DIRECTORY, /* S_IFDIR */ TOMOYO_TYPE_IS_CHAR_DEV, /* S_IFCHR */ TOMOYO_TYPE_IS_FIFO, /* S_IFIFO */ TOMOYO_MODE_SETUID, /* S_ISUID */ TOMOYO_MODE_SETGID, /* S_ISGID */ TOMOYO_MODE_STICKY, /* S_ISVTX */ TOMOYO_MODE_OWNER_READ, /* S_IRUSR */ TOMOYO_MODE_OWNER_WRITE, /* S_IWUSR */ TOMOYO_MODE_OWNER_EXECUTE, /* S_IXUSR */ TOMOYO_MODE_GROUP_READ, /* S_IRGRP */ TOMOYO_MODE_GROUP_WRITE, /* S_IWGRP */ TOMOYO_MODE_GROUP_EXECUTE, /* S_IXGRP */ TOMOYO_MODE_OTHERS_READ, /* S_IROTH */ TOMOYO_MODE_OTHERS_WRITE, /* S_IWOTH */ TOMOYO_MODE_OTHERS_EXECUTE, /* S_IXOTH */ TOMOYO_EXEC_REALPATH, TOMOYO_SYMLINK_TARGET, TOMOYO_PATH1_UID, TOMOYO_PATH1_GID, TOMOYO_PATH1_INO, TOMOYO_PATH1_MAJOR, TOMOYO_PATH1_MINOR, TOMOYO_PATH1_PERM, TOMOYO_PATH1_TYPE, TOMOYO_PATH1_DEV_MAJOR, TOMOYO_PATH1_DEV_MINOR, TOMOYO_PATH2_UID, TOMOYO_PATH2_GID, TOMOYO_PATH2_INO, TOMOYO_PATH2_MAJOR, TOMOYO_PATH2_MINOR, TOMOYO_PATH2_PERM, TOMOYO_PATH2_TYPE, TOMOYO_PATH2_DEV_MAJOR, TOMOYO_PATH2_DEV_MINOR, TOMOYO_PATH1_PARENT_UID, TOMOYO_PATH1_PARENT_GID, TOMOYO_PATH1_PARENT_INO, TOMOYO_PATH1_PARENT_PERM, TOMOYO_PATH2_PARENT_UID, TOMOYO_PATH2_PARENT_GID, TOMOYO_PATH2_PARENT_INO, TOMOYO_PATH2_PARENT_PERM, TOMOYO_MAX_CONDITION_KEYWORD, TOMOYO_NUMBER_UNION, TOMOYO_NAME_UNION, TOMOYO_ARGV_ENTRY, TOMOYO_ENVP_ENTRY, }; /* Index numbers for stat(). */ enum tomoyo_path_stat_index { /* Do not change this order. */ TOMOYO_PATH1, TOMOYO_PATH1_PARENT, TOMOYO_PATH2, TOMOYO_PATH2_PARENT, TOMOYO_MAX_PATH_STAT }; /* Index numbers for operation mode. */ enum tomoyo_mode_index { TOMOYO_CONFIG_DISABLED, TOMOYO_CONFIG_LEARNING, TOMOYO_CONFIG_PERMISSIVE, TOMOYO_CONFIG_ENFORCING, TOMOYO_CONFIG_MAX_MODE, TOMOYO_CONFIG_WANT_REJECT_LOG = 64, TOMOYO_CONFIG_WANT_GRANT_LOG = 128, TOMOYO_CONFIG_USE_DEFAULT = 255, }; /* Index numbers for entry type. */ enum tomoyo_policy_id { TOMOYO_ID_GROUP, TOMOYO_ID_ADDRESS_GROUP, TOMOYO_ID_PATH_GROUP, TOMOYO_ID_NUMBER_GROUP, TOMOYO_ID_TRANSITION_CONTROL, TOMOYO_ID_AGGREGATOR, TOMOYO_ID_MANAGER, TOMOYO_ID_CONDITION, TOMOYO_ID_NAME, TOMOYO_ID_ACL, TOMOYO_ID_DOMAIN, TOMOYO_MAX_POLICY }; /* Index numbers for domain's attributes. */ enum tomoyo_domain_info_flags_index { /* Quota warnning flag. */ TOMOYO_DIF_QUOTA_WARNED, /* * This domain was unable to create a new domain at * tomoyo_find_next_domain() because the name of the domain to be * created was too long or it could not allocate memory. * More than one process continued execve() without domain transition. */ TOMOYO_DIF_TRANSITION_FAILED, TOMOYO_MAX_DOMAIN_INFO_FLAGS }; /* Index numbers for audit type. */ enum tomoyo_grant_log { /* Follow profile's configuration. */ TOMOYO_GRANTLOG_AUTO, /* Do not generate grant log. */ TOMOYO_GRANTLOG_NO, /* Generate grant_log. */ TOMOYO_GRANTLOG_YES, }; /* Index numbers for group entries. */ enum tomoyo_group_id { TOMOYO_PATH_GROUP, TOMOYO_NUMBER_GROUP, TOMOYO_ADDRESS_GROUP, TOMOYO_MAX_GROUP }; /* Index numbers for type of numeric values. */ enum tomoyo_value_type { TOMOYO_VALUE_TYPE_INVALID, TOMOYO_VALUE_TYPE_DECIMAL, TOMOYO_VALUE_TYPE_OCTAL, TOMOYO_VALUE_TYPE_HEXADECIMAL, }; /* Index numbers for domain transition control keywords. */ enum tomoyo_transition_type { /* Do not change this order, */ TOMOYO_TRANSITION_CONTROL_NO_RESET, TOMOYO_TRANSITION_CONTROL_RESET, TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE, TOMOYO_TRANSITION_CONTROL_INITIALIZE, TOMOYO_TRANSITION_CONTROL_NO_KEEP, TOMOYO_TRANSITION_CONTROL_KEEP, TOMOYO_MAX_TRANSITION_TYPE }; /* Index numbers for Access Controls. */ enum tomoyo_acl_entry_type_index { TOMOYO_TYPE_PATH_ACL, TOMOYO_TYPE_PATH2_ACL, TOMOYO_TYPE_PATH_NUMBER_ACL, TOMOYO_TYPE_MKDEV_ACL, TOMOYO_TYPE_MOUNT_ACL, TOMOYO_TYPE_INET_ACL, TOMOYO_TYPE_UNIX_ACL, TOMOYO_TYPE_ENV_ACL, TOMOYO_TYPE_MANUAL_TASK_ACL, }; /* Index numbers for access controls with one pathname. */ enum tomoyo_path_acl_index { TOMOYO_TYPE_EXECUTE, TOMOYO_TYPE_READ, TOMOYO_TYPE_WRITE, TOMOYO_TYPE_APPEND, TOMOYO_TYPE_UNLINK, TOMOYO_TYPE_GETATTR, TOMOYO_TYPE_RMDIR, TOMOYO_TYPE_TRUNCATE, TOMOYO_TYPE_SYMLINK, TOMOYO_TYPE_CHROOT, TOMOYO_TYPE_UMOUNT, TOMOYO_MAX_PATH_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_memory_stat_type { TOMOYO_MEMORY_POLICY, TOMOYO_MEMORY_AUDIT, TOMOYO_MEMORY_QUERY, TOMOYO_MAX_MEMORY_STAT }; enum tomoyo_mkdev_acl_index { TOMOYO_TYPE_MKBLOCK, TOMOYO_TYPE_MKCHAR, TOMOYO_MAX_MKDEV_OPERATION }; /* Index numbers for socket operations. */ enum tomoyo_network_acl_index { TOMOYO_NETWORK_BIND, /* bind() operation. */ TOMOYO_NETWORK_LISTEN, /* listen() operation. */ TOMOYO_NETWORK_CONNECT, /* connect() operation. */ TOMOYO_NETWORK_SEND, /* send() operation. */ TOMOYO_MAX_NETWORK_OPERATION }; /* Index numbers for access controls with two pathnames. */ enum tomoyo_path2_acl_index { TOMOYO_TYPE_LINK, TOMOYO_TYPE_RENAME, TOMOYO_TYPE_PIVOT_ROOT, TOMOYO_MAX_PATH2_OPERATION }; /* Index numbers for access controls with one pathname and one number. */ enum tomoyo_path_number_acl_index { TOMOYO_TYPE_CREATE, TOMOYO_TYPE_MKDIR, TOMOYO_TYPE_MKFIFO, TOMOYO_TYPE_MKSOCK, TOMOYO_TYPE_IOCTL, TOMOYO_TYPE_CHMOD, TOMOYO_TYPE_CHOWN, TOMOYO_TYPE_CHGRP, TOMOYO_MAX_PATH_NUMBER_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/ interfaces. */ enum tomoyo_securityfs_interface_index { TOMOYO_DOMAINPOLICY, TOMOYO_EXCEPTIONPOLICY, TOMOYO_PROCESS_STATUS, TOMOYO_STAT, TOMOYO_AUDIT, TOMOYO_VERSION, TOMOYO_PROFILE, TOMOYO_QUERY, TOMOYO_MANAGER }; /* Index numbers for special mount operations. */ enum tomoyo_special_mount { TOMOYO_MOUNT_BIND, /* mount --bind /source /dest */ TOMOYO_MOUNT_MOVE, /* mount --move /old /new */ TOMOYO_MOUNT_REMOUNT, /* mount -o remount /dir */ TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */ TOMOYO_MOUNT_MAKE_PRIVATE, /* mount --make-private /dir */ TOMOYO_MOUNT_MAKE_SLAVE, /* mount --make-slave /dir */ TOMOYO_MOUNT_MAKE_SHARED, /* mount --make-shared /dir */ TOMOYO_MAX_SPECIAL_MOUNT }; /* Index numbers for functionality. */ enum tomoyo_mac_index { TOMOYO_MAC_FILE_EXECUTE, TOMOYO_MAC_FILE_OPEN, TOMOYO_MAC_FILE_CREATE, TOMOYO_MAC_FILE_UNLINK, TOMOYO_MAC_FILE_GETATTR, TOMOYO_MAC_FILE_MKDIR, TOMOYO_MAC_FILE_RMDIR, TOMOYO_MAC_FILE_MKFIFO, TOMOYO_MAC_FILE_MKSOCK, TOMOYO_MAC_FILE_TRUNCATE, TOMOYO_MAC_FILE_SYMLINK, TOMOYO_MAC_FILE_MKBLOCK, TOMOYO_MAC_FILE_MKCHAR, TOMOYO_MAC_FILE_LINK, TOMOYO_MAC_FILE_RENAME, TOMOYO_MAC_FILE_CHMOD, TOMOYO_MAC_FILE_CHOWN, TOMOYO_MAC_FILE_CHGRP, TOMOYO_MAC_FILE_IOCTL, TOMOYO_MAC_FILE_CHROOT, TOMOYO_MAC_FILE_MOUNT, TOMOYO_MAC_FILE_UMOUNT, TOMOYO_MAC_FILE_PIVOT_ROOT, TOMOYO_MAC_NETWORK_INET_STREAM_BIND, TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, TOMOYO_MAC_NETWORK_INET_RAW_BIND, TOMOYO_MAC_NETWORK_INET_RAW_SEND, TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, TOMOYO_MAC_ENVIRON, TOMOYO_MAX_MAC_INDEX }; /* Index numbers for category of functionality. */ enum tomoyo_mac_category_index { TOMOYO_MAC_CATEGORY_FILE, TOMOYO_MAC_CATEGORY_NETWORK, TOMOYO_MAC_CATEGORY_MISC, TOMOYO_MAX_MAC_CATEGORY_INDEX }; /* * Retry this request. Returned by tomoyo_supervisor() if policy violation has * occurred in enforcing mode and the userspace daemon decided to retry. * * We must choose a positive value in order to distinguish "granted" (which is * 0) and "rejected" (which is a negative value) and "retry". */ #define TOMOYO_RETRY_REQUEST 1 /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_policy_stat_type { /* Do not change this order. */ TOMOYO_STAT_POLICY_UPDATES, TOMOYO_STAT_POLICY_LEARNING, /* == TOMOYO_CONFIG_LEARNING */ TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */ TOMOYO_STAT_POLICY_ENFORCING, /* == TOMOYO_CONFIG_ENFORCING */ TOMOYO_MAX_POLICY_STAT }; /* Index numbers for profile's PREFERENCE values. */ enum tomoyo_pref_index { TOMOYO_PREF_MAX_AUDIT_LOG, TOMOYO_PREF_MAX_LEARNING_ENTRY, TOMOYO_MAX_PREF }; /********** Structure definitions. **********/ /* Common header for holding ACL entries. */ struct tomoyo_acl_head { struct list_head list; s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ } __packed; /* Common header for shared entries. */ struct tomoyo_shared_acl_head { struct list_head list; atomic_t users; } __packed; struct tomoyo_policy_namespace; /* Structure for request info. */ struct tomoyo_request_info { /* * For holding parameters specific to operations which deal files. * NULL if not dealing files. */ struct tomoyo_obj_info *obj; /* * For holding parameters specific to execve() request. * NULL if not dealing execve(). */ struct tomoyo_execve *ee; struct tomoyo_domain_info *domain; /* For holding parameters. */ union { struct { const struct tomoyo_path_info *filename; /* For using wildcards at tomoyo_find_next_domain(). */ const struct tomoyo_path_info *matched_path; /* One of values in "enum tomoyo_path_acl_index". */ u8 operation; } path; struct { const struct tomoyo_path_info *filename1; const struct tomoyo_path_info *filename2; /* One of values in "enum tomoyo_path2_acl_index". */ u8 operation; } path2; struct { const struct tomoyo_path_info *filename; unsigned int mode; unsigned int major; unsigned int minor; /* One of values in "enum tomoyo_mkdev_acl_index". */ u8 operation; } mkdev; struct { const struct tomoyo_path_info *filename; unsigned long number; /* * One of values in * "enum tomoyo_path_number_acl_index". */ u8 operation; } path_number; struct { const struct tomoyo_path_info *name; } environ; struct { const __be32 *address; u16 port; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; bool is_ipv6; } inet_network; struct { const struct tomoyo_path_info *address; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; } unix_network; struct { const struct tomoyo_path_info *type; const struct tomoyo_path_info *dir; const struct tomoyo_path_info *dev; unsigned long flags; int need_dev; } mount; struct { const struct tomoyo_path_info *domainname; } task; } param; struct tomoyo_acl_info *matched_acl; u8 param_type; bool granted; u8 retry; u8 profile; u8 mode; /* One of tomoyo_mode_index . */ u8 type; }; /* Structure for holding a token. */ struct tomoyo_path_info { const char *name; u32 hash; /* = full_name_hash(name, strlen(name)) */ u16 const_len; /* = tomoyo_const_part_length(name) */ bool is_dir; /* = tomoyo_strendswith(name, "/") */ bool is_patterned; /* = tomoyo_path_contains_pattern(name) */ }; /* Structure for holding string data. */ struct tomoyo_name { struct tomoyo_shared_acl_head head; struct tomoyo_path_info entry; }; /* Structure for holding a word. */ struct tomoyo_name_union { /* Either @filename or @group is NULL. */ const struct tomoyo_path_info *filename; struct tomoyo_group *group; }; /* Structure for holding a number. */ struct tomoyo_number_union { unsigned long values[2]; struct tomoyo_group *group; /* Maybe NULL. */ /* One of values in "enum tomoyo_value_type". */ u8 value_type[2]; }; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union { struct in6_addr ip[2]; /* Big endian. */ struct tomoyo_group *group; /* Pointer to address group. */ bool is_ipv6; /* Valid only if @group == NULL. */ }; /* Structure for "path_group"/"number_group"/"address_group" directive. */ struct tomoyo_group { struct tomoyo_shared_acl_head head; const struct tomoyo_path_info *group_name; struct list_head member_list; }; /* Structure for "path_group" directive. */ struct tomoyo_path_group { struct tomoyo_acl_head head; const struct tomoyo_path_info *member_name; }; /* Structure for "number_group" directive. */ struct tomoyo_number_group { struct tomoyo_acl_head head; struct tomoyo_number_union number; }; /* Structure for "address_group" directive. */ struct tomoyo_address_group { struct tomoyo_acl_head head; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union address; }; /* Subset of "struct stat". Used by conditional ACL and audit logs. */ struct tomoyo_mini_stat { kuid_t uid; kgid_t gid; ino_t ino; umode_t mode; dev_t dev; dev_t rdev; }; /* Structure for dumping argv[] and envp[] of "struct linux_binprm". */ struct tomoyo_page_dump { struct page *page; /* Previously dumped page. */ char *data; /* Contents of "page". Size is PAGE_SIZE. */ }; /* Structure for attribute checks in addition to pathname checks. */ struct tomoyo_obj_info { /* * True if tomoyo_get_attributes() was already called, false otherwise. */ bool validate_done; /* True if @stat[] is valid. */ bool stat_valid[TOMOYO_MAX_PATH_STAT]; /* First pathname. Initialized with { NULL, NULL } if no path. */ struct path path1; /* Second pathname. Initialized with { NULL, NULL } if no path. */ struct path path2; /* * Information on @path1, @path1's parent directory, @path2, @path2's * parent directory. */ struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT]; /* * Content of symbolic link to be created. NULL for operations other * than symlink(). */ struct tomoyo_path_info *symlink_target; }; /* Structure for argv[]. */ struct tomoyo_argv { unsigned long index; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for envp[]. */ struct tomoyo_envp { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for execve() operation. */ struct tomoyo_execve { struct tomoyo_request_info r; struct tomoyo_obj_info obj; struct linux_binprm *bprm; const struct tomoyo_path_info *transition; /* For dumping argv[] and envp[]. */ struct tomoyo_page_dump dump; /* For temporary use. */ char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ }; /* Structure for entries which follows "struct tomoyo_condition". */ struct tomoyo_condition_element { /* * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail * of the array of this struct. */ u8 left; /* * Right hand operand. A "struct tomoyo_number_union" for * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for * TOMOYO_NAME_UNION is attached to the tail of the array of this * struct. */ u8 right; /* Equation operator. True if equals or overlaps, false otherwise. */ bool equals; }; /* Structure for optional arguments. */ struct tomoyo_condition { struct tomoyo_shared_acl_head head; u32 size; /* Memory size allocated for this entry. */ u16 condc; /* Number of conditions in this struct. */ u16 numbers_count; /* Number of "struct tomoyo_number_union values". */ u16 names_count; /* Number of "struct tomoyo_name_union names". */ u16 argc; /* Number of "struct tomoyo_argv". */ u16 envc; /* Number of "struct tomoyo_envp". */ u8 grant_log; /* One of values in "enum tomoyo_grant_log". */ const struct tomoyo_path_info *transit; /* Maybe NULL. */ /* * struct tomoyo_condition_element condition[condc]; * struct tomoyo_number_union values[numbers_count]; * struct tomoyo_name_union names[names_count]; * struct tomoyo_argv argv[argc]; * struct tomoyo_envp envp[envc]; */ }; /* Common header for individual entries. */ struct tomoyo_acl_info { struct list_head list; struct tomoyo_condition *cond; /* Maybe NULL. */ s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */ } __packed; /* Structure for domain information. */ struct tomoyo_domain_info { struct list_head list; struct list_head acl_info_list; /* Name of this domain. Never NULL. */ const struct tomoyo_path_info *domainname; /* Namespace for this domain. Never NULL. */ struct tomoyo_policy_namespace *ns; /* Group numbers to use. */ unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG]; u8 profile; /* Profile number to use. */ bool is_deleted; /* Delete flag. */ bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; atomic_t users; /* Number of referring tasks. */ }; /* * Structure for "task manual_domain_transition" directive. */ struct tomoyo_task_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */ /* Pointer to domainname. */ const struct tomoyo_path_info *domainname; }; /* * Structure for "file execute", "file read", "file write", "file append", * "file unlink", "file getattr", "file rmdir", "file truncate", * "file symlink", "file chroot" and "file unmount" directive. */ struct tomoyo_path_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */ u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */ struct tomoyo_name_union name; }; /* * Structure for "file create", "file mkdir", "file mkfifo", "file mksock", * "file ioctl", "file chmod", "file chown" and "file chgrp" directive. */ struct tomoyo_path_number_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */ /* Bitmask of values in "enum tomoyo_path_number_acl_index". */ u8 perm; struct tomoyo_name_union name; struct tomoyo_number_union number; }; /* Structure for "file mkblock" and "file mkchar" directive. */ struct tomoyo_mkdev_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */ struct tomoyo_name_union name; struct tomoyo_number_union mode; struct tomoyo_number_union major; struct tomoyo_number_union minor; }; /* * Structure for "file rename", "file link" and "file pivot_root" directive. */ struct tomoyo_path2_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */ struct tomoyo_name_union name1; struct tomoyo_name_union name2; }; /* Structure for "file mount" directive. */ struct tomoyo_mount_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */ struct tomoyo_name_union dev_name; struct tomoyo_name_union dir_name; struct tomoyo_name_union fs_type; struct tomoyo_number_union flags; }; /* Structure for "misc env" directive in domain policy. */ struct tomoyo_env_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_ENV_ACL */ const struct tomoyo_path_info *env; /* environment variable */ }; /* Structure for "network inet" directive. */ struct tomoyo_inet_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_ipaddr_union address; struct tomoyo_number_union port; }; /* Structure for "network unix" directive. */ struct tomoyo_unix_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_name_union name; }; /* Structure for holding a line from /sys/kernel/security/tomoyo/ interface. */ struct tomoyo_acl_param { char *data; struct list_head *list; struct tomoyo_policy_namespace *ns; bool is_delete; }; #define TOMOYO_MAX_IO_READ_QUEUE 64 /* * Structure for reading/writing policy via /sys/kernel/security/tomoyo * interfaces. */ struct tomoyo_io_buffer { void (*read)(struct tomoyo_io_buffer *head); int (*write)(struct tomoyo_io_buffer *head); __poll_t (*poll)(struct file *file, poll_table *wait); /* Exclusive lock for this structure. */ struct mutex io_sem; char __user *read_user_buf; size_t read_user_buf_avail; struct { struct list_head *ns; struct list_head *domain; struct list_head *group; struct list_head *acl; size_t avail; unsigned int step; unsigned int query_index; u16 index; u16 cond_index; u8 acl_group_index; u8 cond_step; u8 bit; u8 w_pos; bool eof; bool print_this_domain_only; bool print_transition_related_only; bool print_cond_part; const char *w[TOMOYO_MAX_IO_READ_QUEUE]; } r; struct { struct tomoyo_policy_namespace *ns; /* The position currently writing to. */ struct tomoyo_domain_info *domain; /* Bytes available for writing. */ size_t avail; bool is_delete; } w; /* Buffer for reading. */ char *read_buf; /* Size of read buffer. */ size_t readbuf_size; /* Buffer for writing. */ char *write_buf; /* Size of write buffer. */ size_t writebuf_size; /* Type of this interface. */ enum tomoyo_securityfs_interface_index type; /* Users counter protected by tomoyo_io_buffer_list_lock. */ u8 users; /* List for telling GC not to kfree() elements. */ struct list_head list; }; /* * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/ * "no_keep_domain" keyword. */ struct tomoyo_transition_control { struct tomoyo_acl_head head; u8 type; /* One of values in "enum tomoyo_transition_type". */ /* True if the domainname is tomoyo_get_last_name(). */ bool is_last_name; const struct tomoyo_path_info *domainname; /* Maybe NULL */ const struct tomoyo_path_info *program; /* Maybe NULL */ }; /* Structure for "aggregator" keyword. */ struct tomoyo_aggregator { struct tomoyo_acl_head head; const struct tomoyo_path_info *original_name; const struct tomoyo_path_info *aggregated_name; }; /* Structure for policy manager. */ struct tomoyo_manager { struct tomoyo_acl_head head; /* A path to program or a domainname. */ const struct tomoyo_path_info *manager; }; struct tomoyo_preference { unsigned int learning_max_entry; bool enforcing_verbose; bool learning_verbose; bool permissive_verbose; }; /* Structure for /sys/kernel/security/tomnoyo/profile interface. */ struct tomoyo_profile { const struct tomoyo_path_info *comment; struct tomoyo_preference *learning; struct tomoyo_preference *permissive; struct tomoyo_preference *enforcing; struct tomoyo_preference preference; u8 default_config; u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; unsigned int pref[TOMOYO_MAX_PREF]; }; /* Structure for representing YYYY/MM/DD hh/mm/ss. */ struct tomoyo_time { u16 year; u8 month; u8 day; u8 hour; u8 min; u8 sec; }; /* Structure for policy namespace. */ struct tomoyo_policy_namespace { /* Profile table. Memory is allocated as needed. */ struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES]; /* List of "struct tomoyo_group". */ struct list_head group_list[TOMOYO_MAX_GROUP]; /* List of policy. */ struct list_head policy_list[TOMOYO_MAX_POLICY]; /* The global ACL referred by "use_group" keyword. */ struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS]; /* List for connecting to tomoyo_namespace_list list. */ struct list_head namespace_list; /* Profile version. Currently only 20150505 is defined. */ unsigned int profile_version; /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */ const char *name; }; /* Structure for "struct task_struct"->security. */ struct tomoyo_task { struct tomoyo_domain_info *domain_info; struct tomoyo_domain_info *old_domain_info; }; /********** Function prototypes. **********/ bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, const struct tomoyo_group *group); bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr); bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond); bool tomoyo_correct_domain(const unsigned char *domainname); bool tomoyo_correct_path(const char *filename); bool tomoyo_correct_word(const char *string); bool tomoyo_domain_def(const unsigned char *buffer); bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r); bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump); bool tomoyo_memory_ok(void *ptr); bool tomoyo_number_matches_group(const unsigned long min, const unsigned long max, const struct tomoyo_group *group); bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr); bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr); bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr); bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern); bool tomoyo_permstr(const char *string, const char *keyword); bool tomoyo_str_starts(char **src, const char *find); char *tomoyo_encode(const char *str); char *tomoyo_encode2(const char *str, int str_len); char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); char *tomoyo_read_token(struct tomoyo_acl_param *param); char *tomoyo_realpath_from_path(const struct path *path); char *tomoyo_realpath_nofollow(const char *pathname); const char *tomoyo_get_exe(void); const struct tomoyo_path_info *tomoyo_compare_name_union (const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr); const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param); const struct tomoyo_path_info *tomoyo_get_name(const char *name); const struct tomoyo_path_info *tomoyo_path_matches_group (const struct tomoyo_path_info *pathname, const struct tomoyo_group *group); int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag); void tomoyo_close_control(struct tomoyo_io_buffer *head); int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env); int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename); int tomoyo_find_next_domain(struct linux_binprm *bprm); int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index); int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index); int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev); int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page); int tomoyo_open_control(const u8 type, struct file *file); int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2); int tomoyo_path_number_perm(const u8 operation, const struct path *path, unsigned long number); int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target); __poll_t tomoyo_poll_control(struct file *file, poll_table *wait); __poll_t tomoyo_poll_log(struct file *file, poll_table *wait); int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_listen_permission(struct socket *sock); int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size); int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) __printf(2, 3); int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate) (struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)); int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)); int tomoyo_write_aggregator(struct tomoyo_acl_param *param); int tomoyo_write_file(struct tomoyo_acl_param *param); int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_misc(struct tomoyo_acl_param *param); int tomoyo_write_inet_network(struct tomoyo_acl_param *param); int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_unix_network(struct tomoyo_acl_param *param); ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, const int buffer_len); ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len); struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param); struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit); struct tomoyo_domain_info *tomoyo_domain(void); struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname); struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param, const u8 idx); struct tomoyo_policy_namespace *tomoyo_assign_namespace (const char *domainname); struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile); u8 tomoyo_parse_ulong(unsigned long *result, char **str); void *tomoyo_commit_ok(void *data, const unsigned int size); void __init tomoyo_load_builtin_policy(void); void __init tomoyo_mm_init(void); void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)); void tomoyo_check_profile(void); void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp); void tomoyo_del_condition(struct list_head *element); void tomoyo_fill_path_info(struct tomoyo_path_info *ptr); void tomoyo_get_attributes(struct tomoyo_obj_info *obj); void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns); void tomoyo_load_policy(const char *filename); void tomoyo_normalize_line(unsigned char *buffer); void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register); void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr); void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type); void tomoyo_put_name_union(struct tomoyo_name_union *ptr); void tomoyo_put_number_union(struct tomoyo_number_union *ptr); void tomoyo_read_log(struct tomoyo_io_buffer *head); void tomoyo_update_stat(const u8 index); void tomoyo_warn_oom(const char *function); void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) __printf(2, 3); void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); /********** External variable definitions. **********/ extern bool tomoyo_policy_loaded; extern int tomoyo_enabled; extern const char * const tomoyo_condition_keyword [TOMOYO_MAX_CONDITION_KEYWORD]; extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE]; extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION]; extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX]; extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION]; extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX]; extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION]; extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION]; extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION]; extern struct list_head tomoyo_condition_list; extern struct list_head tomoyo_domain_list; extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH]; extern struct list_head tomoyo_namespace_list; extern struct mutex tomoyo_policy_lock; extern struct srcu_struct tomoyo_ss; extern struct tomoyo_domain_info tomoyo_kernel_domain; extern struct tomoyo_policy_namespace tomoyo_kernel_namespace; extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT]; extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT]; extern struct lsm_blob_sizes tomoyo_blob_sizes; /********** Inlined functions. **********/ /** * tomoyo_read_lock - Take lock for protecting policy. * * Returns index number for tomoyo_read_unlock(). */ static inline int tomoyo_read_lock(void) { return srcu_read_lock(&tomoyo_ss); } /** * tomoyo_read_unlock - Release lock for protecting policy. * * @idx: Index number returned by tomoyo_read_lock(). * * Returns nothing. */ static inline void tomoyo_read_unlock(int idx) { srcu_read_unlock(&tomoyo_ss, idx); } /** * tomoyo_sys_getppid - Copy of getppid(). * * Returns parent process's PID. * * Alpha does not have getppid() defined. To be able to build this module on * Alpha, I have to copy getppid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getppid(void) { pid_t pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } /** * tomoyo_sys_getpid - Copy of getpid(). * * Returns current thread's PID. * * Alpha does not have getpid() defined. To be able to build this module on * Alpha, I have to copy getpid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getpid(void) { return task_tgid_vnr(current); } /** * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure. * * @a: Pointer to "struct tomoyo_path_info". * @b: Pointer to "struct tomoyo_path_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a, const struct tomoyo_path_info *b) { return a->hash != b->hash || strcmp(a->name, b->name); } /** * tomoyo_put_name - Drop reference on "struct tomoyo_name". * * @name: Pointer to "struct tomoyo_path_info". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_name(const struct tomoyo_path_info *name) { if (name) { struct tomoyo_name *ptr = container_of(name, typeof(*ptr), entry); atomic_dec(&ptr->head.users); } } /** * tomoyo_put_condition - Drop reference on "struct tomoyo_condition". * * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_condition(struct tomoyo_condition *cond) { if (cond) atomic_dec(&cond->head.users); } /** * tomoyo_put_group - Drop reference on "struct tomoyo_group". * * @group: Pointer to "struct tomoyo_group". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_group(struct tomoyo_group *group) { if (group) atomic_dec(&group->head.users); } /** * tomoyo_task - Get "struct tomoyo_task" for specified thread. * * @task - Pointer to "struct task_struct". * * Returns pointer to "struct tomoyo_task" for specified thread. */ static inline struct tomoyo_task *tomoyo_task(struct task_struct *task) { return task->security + tomoyo_blob_sizes.lbs_task; } /** * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry. * * @a: Pointer to "struct tomoyo_name_union". * @b: Pointer to "struct tomoyo_name_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_name_union (const struct tomoyo_name_union *a, const struct tomoyo_name_union *b) { return a->filename == b->filename && a->group == b->group; } /** * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry. * * @a: Pointer to "struct tomoyo_number_union". * @b: Pointer to "struct tomoyo_number_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_number_union (const struct tomoyo_number_union *a, const struct tomoyo_number_union *b) { return a->values[0] == b->values[0] && a->values[1] == b->values[1] && a->group == b->group && a->value_type[0] == b->value_type[0] && a->value_type[1] == b->value_type[1]; } /** * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry. * * @a: Pointer to "struct tomoyo_ipaddr_union". * @b: Pointer to "struct tomoyo_ipaddr_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_ipaddr_union (const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b) { return !memcmp(a->ip, b->ip, sizeof(a->ip)) && a->group == b->group && a->is_ipv6 == b->is_ipv6; } /** * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread. * * Returns pointer to "struct tomoyo_policy_namespace" for current thread. */ static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void) { return tomoyo_domain()->ns; } /** * list_for_each_cookie - iterate over a list with cookie. * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_cookie(pos, head) \ if (!pos) \ pos = srcu_dereference((head)->next, &tomoyo_ss); \ for ( ; pos != (head); pos = srcu_dereference(pos->next, &tomoyo_ss)) #endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */ |
238 239 238 239 26 139 318 319 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 | // SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o. */ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/mnt_idmapping.h> #include <uapi/linux/lsm.h> #define CREATE_TRACE_POINTS #include <trace/events/capability.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot. */ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable_helper - Determine whether a task has a particular effective * capability. * @cred: The credentials to use * @target_ns: The user namespace of the resource being accessed * @cred_ns: The user namespace of the credentials * @cap: The capability to check for * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * See cap_capable for more details. */ static inline int cap_capable_helper(const struct cred *cred, struct user_namespace *target_ns, const struct user_namespace *cred_ns, int cap) { struct user_namespace *ns = target_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (likely(ns == cred_ns)) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @target_ns: The user namespace of the resource being accessed * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h (unused) * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_capable() has reverse semantics to the capable() call * and friends. That is cap_capable() returns an int 0 when a task has * a capability, while the kernel's capable(), has_ns_capability(), * has_ns_capability_noaudit(), and has_capability_noaudit() return a * bool true (1) for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *target_ns, int cap, unsigned int opts) { const struct user_namespace *cred_ns = cred->user_ns; int ret = cap_capable_helper(cred, target_ns, cred_ns, cap); trace_cap_capable(cred, target_ns, cred_ns, cap, ret); return ret; } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. */ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Return: 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: 0 if successful, -ve on error. */ int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { int error; error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } static bool rootid_owns_currentns(vfsuid_t rootvfsuid) { struct user_namespace *ns; kuid_t kroot; if (!vfsuid_valid(rootvfsuid)) return false; kroot = vfsuid_into_kuid(rootvfsuid); for (ns = current_user_ns();; ns = ns->parent) { if (from_kuid(ns, kroot) == 0) return true; if (ns == &init_user_ns) break; } return false; } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(int size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(int size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { int size; kuid_t kroot; vfsuid_t vfsroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf, sizeof(struct vfs_ns_cap_data), GFP_NOFS); dput(dentry); /* gcc11 complains if we don't check for !tmpbuf */ if (size < 0 || !tmpbuf) goto out_free; fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header(size, cap)) { root = 0; } else if (is_v3header(size, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If this is an idmapped mount shift the kuid. */ vfsroot = make_vfsuid(idmap, fs_ns, kroot); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot)); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!rootid_owns_currentns(vfsroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } /** * rootid_from_xattr - translate root uid of vfs caps * * @value: vfs caps value which may be modified by this function * @size: size of @ivalue * @task_ns: user namespace of the caller */ static vfsuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns) { const struct vfs_ns_cap_data *nscap = value; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); return VFSUIDT_INIT(make_kuid(task_ns, rootid)); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /** * cap_convert_nscap - check vfs caps * * @idmap: idmap of the mount the inode was found from * @dentry: used to retrieve inode to check permissions on * @ivalue: vfs caps value which may be modified by this function * @size: size of @ivalue * * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: On success, return the new size; on error, return < 0. */ int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; vfsuid_t vfsrootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; vfsrootid = rootid_from_xattr(*ivalue, size, task_ns); if (!vfsuid_valid(vfsrootid)) return -EINVAL; rootid = from_vfsuid(idmap, fs_ns, vfsrootid); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.val = (new->cap_bset.val & caps->permitted.val) | (new->cap_inheritable.val & caps->inheritable.val); if (caps->permitted.val & ~new->cap_permitted.val) /* insufficient to execute correctly */ ret = -EPERM; /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /** * get_vfs_caps_from_disk - retrieve vfs caps from disk * * @idmap: idmap of the mount the inode was found from * @dentry: dentry from which @inode is retrieved * @cpu_caps: vfs capabilities * * Extract the on-exec-apply capability sets for an executable file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; vfsuid_t rootvfsuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid); if (!vfsuid_valid(rootvfsuid)) return -ENODATA; /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ if (!rootid_owns_currentns(rootvfsuid)) return -ENODATA; cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted); cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable); /* * Rev1 had just a single 32-bit word, later expanded * to a second one for the high bits */ if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) { cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32; cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32; } cpu_caps->permitted.val &= CAP_VALID_MASK; cpu_caps->inheritable.val &= CAP_VALID_MASK; cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid); return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, const struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. */ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file_mnt_idmap(file), file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) static inline bool __is_setuid(struct cred *new, const struct cred *old) { return !uid_eq(new->euid, old->uid); } static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. * * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (!__is_setuid(new, old) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. * * Return: 0 if successful, -ve on error. */ int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ is_setid = __is_setuid(new, old) || __is_setgid(new, old); if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || is_setid) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. */ if (is_setid || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. */ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it * calls setuid() and switches away from uid==0. Both permitted and * effective sets will be retained. * Without this change, it was impossible for a daemon to drop only some * of its privilege. The call to setuid(!=0) would drop all privileges! * Keeping uid 0 is not an option because uid 0 owns too many vital * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { kuid_t root_uid = make_kuid(old->user_ns, 0); if ((uid_eq(old->uid, root_uid) || uid_eq(old->euid, root_uid) || uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && !uid_eq(new->suid, root_uid))) { if (!issecure(SECURE_KEEP_CAPS)) { cap_clear(new->cap_permitted); cap_clear(new->cap_effective); } /* * Pre-ambient programs expect setresuid to nonroot followed * by exec to drop capabilities. We should make sure that * this remains the case. */ cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid)) new->cap_effective = new->cap_permitted; } /** * cap_task_fix_setuid - Fix up the results of setuid() call * @new: The proposed credentials * @old: The current task's current credentials * @flags: Indications of what has changed * * Fix up the results of setuid() call before the credential changes are * actually applied. * * Return: 0 to grant the changes, -ve to deny them. */ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* juggle the capabilities to follow [RES]UID changes unless * otherwise suppressed */ if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: /* juggle the capabilities to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? * if not, we might be a bit too harsh here. */ if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(old->user_ns, 0); if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_drop_fs_set(new->cap_effective); if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_raise_fs_set(new->cap_effective, new->cap_permitted); } break; default: return -EINVAL; } return 0; } /* * Rationale: code calling task_setscheduler, task_setioprio, and * task_setnice, assumes that * . if capable(cap_sys_nice), then those actions should be allowed * . if not capable(cap_sys_nice), but acting on your own processes, * then those actions should be allowed * This is insufficient now since you can call code without suid, but * yet with increased caps. * So we check for increased caps on the target process. */ static int cap_safe_nice(struct task_struct *p) { int is_subset, ret = 0; rcu_read_lock(); is_subset = cap_issubset(__task_cred(p)->cap_permitted, current_cred()->cap_permitted); if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) ret = -EPERM; rcu_read_unlock(); return ret; } /** * cap_task_setscheduler - Determine if scheduler policy change is permitted * @p: The task to affect * * Determine if the requested scheduler policy change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } /** * cap_task_setioprio - Determine if I/O priority change is permitted * @p: The task to affect * @ioprio: The I/O priority to set * * Determine if the requested I/O priority change is permitted for the specified * task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setioprio(struct task_struct *p, int ioprio) { return cap_safe_nice(p); } /** * cap_task_setnice - Determine if task priority change is permitted * @p: The task to affect * @nice: The nice value to set * * Determine if the requested task priority change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setnice(struct task_struct *p, int nice) { return cap_safe_nice(p); } /* * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from * the current task's bounding set. Returns 0 on success, -ve on error. */ static int cap_prctl_drop(unsigned long cap) { struct cred *new; if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_lower(new->cap_bset, cap); return commit_creds(new); } /** * cap_task_prctl - Implement process control functions for this security module * @option: The process control function requested * @arg2: The argument data for this function * @arg3: The argument data for this function * @arg4: The argument data for this function * @arg5: The argument data for this function * * Allow process control functions (sys_prctl()) to alter capabilities; may * also deny access to other functions not otherwise implemented here. * * Return: 0 or +ve on success, -ENOSYS if this function is not implemented * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM * modules will consider performing the function. */ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { const struct cred *old = current_cred(); struct cred *new; switch (option) { case PR_CAPBSET_READ: if (!cap_valid(arg2)) return -EINVAL; return !!cap_raised(old->cap_bset, arg2); case PR_CAPBSET_DROP: return cap_prctl_drop(arg2); /* * The next four prctl's remain to assist with transitioning a * system from legacy UID=0 based privilege (when filesystem * capabilities are not in use) to a system using filesystem * capabilities only - as the POSIX.1e draft intended. * * Note: * * PR_SET_SECUREBITS = * issecure_mask(SECURE_KEEP_CAPS_LOCKED) * | issecure_mask(SECURE_NOROOT) * | issecure_mask(SECURE_NOROOT_LOCKED) * | issecure_mask(SECURE_NO_SETUID_FIXUP) * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) * * will ensure that the current process and all of its * children will be locked into a pure * capability-based-privilege environment. */ case PR_SET_SECUREBITS: if ((((old->securebits & SECURE_ALL_LOCKS) >> 1) & (old->securebits ^ arg2)) /*[1]*/ || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits */ ) /* cannot change a locked bit */ return -EPERM; /* * Doing anything requires privilege (go read about the * "sendmail capabilities bug"), except for unprivileged bits. * Indeed, the SECURE_ALL_UNPRIVILEGED bits are not * restrictions enforced by the kernel but by user space on * itself. */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) != 0) { const unsigned long unpriv_and_locks = SECURE_ALL_UNPRIVILEGED | SECURE_ALL_UNPRIVILEGED << 1; const unsigned long changed = old->securebits ^ arg2; /* For legacy reason, denies non-change. */ if (!changed) return -EPERM; /* Denies privileged changes. */ if (changed & ~unpriv_and_locks) return -EPERM; } new = prepare_creds(); if (!new) return -ENOMEM; new->securebits = arg2; return commit_creds(new); case PR_GET_SECUREBITS: return old->securebits; case PR_GET_KEEPCAPS: return !!issecure(SECURE_KEEP_CAPS); case PR_SET_KEEPCAPS: if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ return -EINVAL; if (issecure(SECURE_KEEP_CAPS_LOCKED)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2) new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); case PR_CAP_AMBIENT: if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { if (arg3 | arg4 | arg5) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_clear(new->cap_ambient); return commit_creds(new); } if (((!cap_valid(arg3)) | arg4 | arg5)) return -EINVAL; if (arg2 == PR_CAP_AMBIENT_IS_SET) { return !!cap_raised(current_cred()->cap_ambient, arg3); } else if (arg2 != PR_CAP_AMBIENT_RAISE && arg2 != PR_CAP_AMBIENT_LOWER) { return -EINVAL; } else { if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, arg3) || issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2 == PR_CAP_AMBIENT_RAISE) cap_raise(new->cap_ambient, arg3); else cap_lower(new->cap_ambient, arg3); return commit_creds(new); } default: /* No functionality available - continue with default */ return -ENOSYS; } } /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made * @pages: The size of the mapping * * Determine whether the allocation of a new virtual mapping by the current * task is permitted. * * Return: 0 if permission granted, negative error code if not. */ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { return cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT); } /** * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. * * Return: 0 if this mapping should be allowed or -EPERM if not. */ int cap_mmap_addr(unsigned long addr) { int ret = 0; if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; } return ret; } #ifdef CONFIG_SECURITY static const struct lsm_id capability_lsmid = { .name = "capability", .id = LSM_ID_CAPABILITY, }; static struct security_hook_list capability_hooks[] __ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), LSM_HOOK_INIT(task_prctl, cap_task_prctl), LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler), LSM_HOOK_INIT(task_setioprio, cap_task_setioprio), LSM_HOOK_INIT(task_setnice, cap_task_setnice), LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), }; static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), &capability_lsmid); return 0; } DEFINE_LSM(capability) = { .name = "capability", .order = LSM_ORDER_FIRST, .init = capability_init, }; #endif /* CONFIG_SECURITY */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fs-verity: read-only file-based authenticity protection * * This header declares the interface between the fs/verity/ support layer and * filesystems that support fs-verity. * * Copyright 2019 Google LLC */ #ifndef _LINUX_FSVERITY_H #define _LINUX_FSVERITY_H #include <linux/fs.h> #include <linux/mm.h> #include <crypto/hash_info.h> #include <crypto/sha2.h> #include <uapi/linux/fsverity.h> /* * Largest digest size among all hash algorithms supported by fs-verity. * Currently assumed to be <= size of fsverity_descriptor::root_hash. */ #define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 /* Verity operations for filesystems */ struct fsverity_operations { /** * Begin enabling verity on the given file. * * @filp: a readonly file descriptor for the file * * The filesystem must do any needed filesystem-specific preparations * for enabling verity, e.g. evicting inline data. It also must return * -EBUSY if verity is already being enabled on the given file. * * i_rwsem is held for write. * * Return: 0 on success, -errno on failure */ int (*begin_enable_verity)(struct file *filp); /** * End enabling verity on the given file. * * @filp: a readonly file descriptor for the file * @desc: the verity descriptor to write, or NULL on failure * @desc_size: size of verity descriptor, or 0 on failure * @merkle_tree_size: total bytes the Merkle tree took up * * If desc == NULL, then enabling verity failed and the filesystem only * must do any necessary cleanups. Else, it must also store the given * verity descriptor to a fs-specific location associated with the inode * and do any fs-specific actions needed to mark the inode as a verity * inode, e.g. setting a bit in the on-disk inode. The filesystem is * also responsible for setting the S_VERITY flag in the VFS inode. * * i_rwsem is held for write, but it may have been dropped between * ->begin_enable_verity() and ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*end_enable_verity)(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size); /** * Get the verity descriptor of the given inode. * * @inode: an inode with the S_VERITY flag set * @buf: buffer in which to place the verity descriptor * @bufsize: size of @buf, or 0 to retrieve the size only * * If bufsize == 0, then the size of the verity descriptor is returned. * Otherwise the verity descriptor is written to 'buf' and its actual * size is returned; -ERANGE is returned if it's too large. This may be * called by multiple processes concurrently on the same inode. * * Return: the size on success, -errno on failure */ int (*get_verity_descriptor)(struct inode *inode, void *buf, size_t bufsize); /** * Read a Merkle tree page of the given inode. * * @inode: the inode * @index: 0-based index of the page within the Merkle tree * @num_ra_pages: The number of Merkle tree pages that should be * prefetched starting at @index if the page at @index * isn't already cached. Implementations may ignore this * argument; it's only a performance optimization. * * This can be called at any time on an open verity file. It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. * * Return: the page on success, ERR_PTR() on failure */ struct page *(*read_merkle_tree_page)(struct inode *inode, pgoff_t index, unsigned long num_ra_pages); /** * Write a Merkle tree block to the given inode. * * @inode: the inode for which the Merkle tree is being built * @buf: the Merkle tree block to write * @pos: the position of the block in the Merkle tree (in bytes) * @size: the Merkle tree block size (in bytes) * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, u64 pos, unsigned int size); }; #ifdef CONFIG_FS_VERITY static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fsverity_set_info(). * I.e., another task may publish ->i_verity_info concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_verity_info); } /* enable.c */ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg); /* open.c */ int __fsverity_file_open(struct inode *inode, struct file *filp); int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void __fsverity_cleanup_inode(struct inode *inode); /** * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { if (inode->i_verity_info) __fsverity_cleanup_inode(inode); } /* read_metadata.c */ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); /* verify.c */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); #else /* !CONFIG_FS_VERITY */ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { return NULL; } /* enable.c */ static inline int fsverity_ioctl_enable(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } /* measure.c */ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg) { /* * fsverity is not enabled in the kernel configuration, so always report * that the file doesn't have fsverity enabled (digest size 0). */ return 0; } /* open.c */ static inline int __fsverity_file_open(struct inode *inode, struct file *filp) { return -EOPNOTSUPP; } static inline int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline void fsverity_cleanup_inode(struct inode *inode) { } /* read_metadata.c */ static inline int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) { return -EOPNOTSUPP; } /* verify.c */ static inline bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(1); return false; } static inline void fsverity_verify_bio(struct bio *bio) { WARN_ON_ONCE(1); } static inline void fsverity_enqueue_verify_work(struct work_struct *work) { WARN_ON_ONCE(1); } #endif /* !CONFIG_FS_VERITY */ static inline bool fsverity_verify_folio(struct folio *folio) { return fsverity_verify_blocks(folio, folio_size(folio), 0); } static inline bool fsverity_verify_page(struct page *page) { return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0); } /** * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * * This checks whether ->i_verity_info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) * * Return: true if reads need to go through fs-verity, otherwise false */ static inline bool fsverity_active(const struct inode *inode) { return fsverity_get_info(inode) != NULL; } /** * fsverity_file_open() - prepare to open a verity file * @inode: the inode being opened * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, * set up the inode's ->i_verity_info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. * * Return: 0 on success, -errno on failure */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) { if (IS_VERITY(inode)) return __fsverity_file_open(inode, filp); return 0; } /** * fsverity_prepare_setattr() - prepare to change a verity inode's attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Verity files are immutable, so deny truncates. This isn't covered by the * open-time check because sys_truncate() takes a path, not a file descriptor. * * Return: 0 on success, -errno on failure */ static inline int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_VERITY(d_inode(dentry))) return __fsverity_prepare_setattr(dentry, attr); return 0; } #endif /* _LINUX_FSVERITY_H */ |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> struct macvtap_dev { struct macvlan_dev vlan; struct tap_dev tap; }; /* * Variables for dealing with macvtaps device numbers. */ static dev_t macvtap_major; static const void *macvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class macvtap_class = { .name = "macvtap", .ns_type = &net_ns_type_operations, .namespace = macvtap_net_namespace, }; static struct cdev macvtap_cdev; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static void macvtap_count_tx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_dropped); } static void macvtap_count_rx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; macvlan_count_rx(vlan, 0, 0, 0); } static void macvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; vlan->set_features = features; netdev_update_features(vlan->dev); } static int macvtap_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; /* Register callbacks for rx/tx drops accounting and updating * net_device features */ vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped; vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped; vlantap->tap.update_features = macvtap_update_features; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = macvlan_common_newlink(dev, params, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return 0; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { struct macvtap_dev *vlantap = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlantap->tap); macvlan_dellink(dev, head); } static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; } static struct net *macvtap_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &macvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(macvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); classdev = device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(macvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); device_destroy(&macvtap_class, devt); tap_free_minor(macvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block macvtap_notifier_block __read_mostly = { .notifier_call = macvtap_device_event, }; static int __init macvtap_init(void) { int err; err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", THIS_MODULE); if (err) goto out1; err = class_register(&macvtap_class); if (err) goto out2; err = register_netdevice_notifier(&macvtap_notifier_block); if (err) goto out3; err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&macvtap_notifier_block); out3: class_unregister(&macvtap_class); out2: tap_destroy_cdev(macvtap_major, &macvtap_cdev); out1: return err; } module_init(macvtap_init); static void __exit macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); unregister_netdevice_notifier(&macvtap_notifier_block); class_unregister(&macvtap_class); tap_destroy_cdev(macvtap_major, &macvtap_cdev); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_DESCRIPTION("MAC-VLAN based tap driver"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_LICENSE("GPL"); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | // SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> */ #ifndef _LINUX_HUGETLB_VMEMMAP_H #define _LINUX_HUGETLB_VMEMMAP_H #include <linux/hugetlb.h> #include <linux/io.h> #include <linux/memblock.h> /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See * Documentation/mm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio); long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list); #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT void hugetlb_vmemmap_init_early(int nid); void hugetlb_vmemmap_init_late(int nid); #endif static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { return pages_per_huge_page(h) * sizeof(struct page); } /* * Return how many vmemmap size associated with a HugeTLB page that can be * optimized and can be freed to the buddy allocator. */ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; if (!is_power_of_2(sizeof(struct page))) return 0; return size > 0 ? size : 0; } #else static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return 0; } static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { list_splice_init(folio_list, non_hvo_folios); return 0; } static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { } static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { } static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) { } static inline void hugetlb_vmemmap_init_early(int nid) { } static inline void hugetlb_vmemmap_init_late(int nid) { } static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; } #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) { return hugetlb_vmemmap_optimizable_size(h) != 0; } #endif /* _LINUX_HUGETLB_VMEMMAP_H */ |
40 40 94 47 3 41 41 51 51 51 50 42 34 42 42 42 51 104 1 1 1 99 94 52 40 82 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <trace/events/kvm.h> #include "trace.h" void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) { void *datap = NULL; union { u8 byte; u16 hword; u32 word; u64 dword; } tmp; switch (len) { case 1: tmp.byte = data; datap = &tmp.byte; break; case 2: tmp.hword = data; datap = &tmp.hword; break; case 4: tmp.word = data; datap = &tmp.word; break; case 8: tmp.dword = data; datap = &tmp.dword; break; } memcpy(buf, datap, len); } unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) { unsigned long data = 0; union { u16 hword; u32 word; u64 dword; } tmp; switch (len) { case 1: data = *(u8 *)buf; break; case 2: memcpy(&tmp.hword, buf, len); data = tmp.hword; break; case 4: memcpy(&tmp.word, buf, len); data = tmp.word; break; case 8: memcpy(&tmp.dword, buf, len); data = tmp.dword; break; } return data; } static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu) { if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) return false; if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA32_UND): case unpack_vcpu_flag(EXCEPT_AA32_IABT): case unpack_vcpu_flag(EXCEPT_AA32_DABT): return true; default: return false; } } else { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): return true; default: return false; } } } /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation * * @vcpu: The VCPU pointer */ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) { unsigned long data; unsigned int len; int mask; /* * Detect if the MMIO return was already handled or if userspace aborted * the MMIO access. */ if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu))) return 1; vcpu->mmio_needed = 0; if (!kvm_vcpu_dabt_iswrite(vcpu)) { struct kvm_run *run = vcpu->run; len = kvm_vcpu_dabt_get_as(vcpu); data = kvm_mmio_read_buf(run->mmio.data, len); if (kvm_vcpu_dabt_issext(vcpu) && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); data = (data ^ mask) - mask; } if (!kvm_vcpu_dabt_issf(vcpu)) data = data & 0xffffffff; trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* * The MMIO instruction is emulated and should not be re-executed * in the guest. */ kvm_incr_pc(vcpu); return 1; } int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { struct kvm_run *run = vcpu->run; unsigned long data; unsigned long rt; int ret; bool is_write; int len; u8 data_buf[8]; /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. * * In the protected VM case, there isn't much userspace can do * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); if (vcpu_is_protected(vcpu)) { kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); return 1; } if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); run->arm_nisv.fault_ipa = fault_ipa; return 0; } return -ENOSYS; } /* * Prepare MMIO operation. First decode the syndrome data we get * from the CPU. Then try if some in-kernel emulation feels * responsible, otherwise let user space do its magic. */ is_write = kvm_vcpu_dabt_iswrite(vcpu); len = kvm_vcpu_dabt_get_as(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); if (is_write) { data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } /* Now prepare kvm_run for the potential return to userland. */ run->mmio.is_write = is_write; run->mmio.phys_addr = fault_ipa; run->mmio.len = len; vcpu->mmio_needed = 1; if (!ret) { /* We handled the access successfully in the kernel. */ if (!is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_kernel++; kvm_handle_mmio_return(vcpu); return 1; } if (is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_user++; run->exit_reason = KVM_EXIT_MMIO; return 0; } |
135 135 50 12 221 221 134 135 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __ARM64_KVM_MMU_H__ #define __ARM64_KVM_MMU_H__ #include <asm/page.h> #include <asm/memory.h> #include <asm/mmu.h> #include <asm/cpufeature.h> /* * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express * "negative" addresses. This makes it impossible to directly share * mappings with the kernel. * * Instead, give the HYP mode its own VA region at a fixed offset from * the kernel by just masking the top bits (which are all ones for a * kernel address). We need to find out how many bits to mask. * * We want to build a set of page tables that cover both parts of the * idmap (the trampoline page used to initialize EL2), and our normal * runtime VA space, at the same time. * * Given that the kernel uses VA_BITS for its entire address space, * and that half of that space (VA_BITS - 1) is used for the linear * mapping, we can also limit the EL2 space to (VA_BITS - 1). * * The main question is "Within the VA_BITS space, does EL2 use the * top or the bottom half of that space to shadow the kernel's linear * mapping?". As we need to idmap the trampoline page, this is * determined by the range in which this page lives. * * If the page is in the bottom half, we have to use the top half. If * the page is in the top half, we have to use the bottom half: * * T = __pa_symbol(__hyp_idmap_text_start) * if (T & BIT(VA_BITS - 1)) * HYP_VA_MIN = 0 //idmap in upper half * else * HYP_VA_MIN = 1 << (VA_BITS - 1) * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 * * When using VHE, there are no separate hyp mappings and all KVM * functionality is already mapped as part of the main kernel * mappings, and none of this applies in that case. */ #ifdef __ASSEMBLY__ #include <asm/alternative.h> /* * Convert a hypervisor VA to a PA * reg: hypervisor address to be converted in place * tmp: temporary register */ .macro hyp_pa reg, tmp ldr_l \tmp, hyp_physvirt_offset add \reg, \reg, \tmp .endm /* * Convert a hypervisor VA to a kernel image address * reg: hypervisor address to be converted in place * tmp: temporary register * * The actual code generation takes place in kvm_get_kimage_voffset, and * the instructions below are only there to reserve the space and * perform the register allocation (kvm_get_kimage_voffset uses the * specific registers encoded in the instructions). */ .macro hyp_kimg_va reg, tmp /* Convert hyp VA -> PA. */ hyp_pa \reg, \tmp /* Load kimage_voffset. */ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_get_kimage_voffset movz \tmp, #0 movk \tmp, #0, lsl #16 movk \tmp, #0, lsl #32 movk \tmp, #0, lsl #48 alternative_cb_end /* Convert PA -> kimg VA. */ add \reg, \reg, \tmp .endm #else #include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/cache.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/kvm_emulate.h> #include <asm/kvm_host.h> #include <asm/kvm_nested.h> void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); void kvm_apply_hyp_relocations(void); #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) /* * Convert a kernel VA into a HYP VA. * * Can be called from hyp or non-hyp context. * * The actual code generation takes place in kvm_update_va_mask(), and * the instructions below are only there to reserve the space and * perform the register allocation (kvm_update_va_mask() uses the * specific registers encoded in the instructions). */ static __always_inline unsigned long __kern_hyp_va(unsigned long v) { /* * This #ifndef is an optimisation for when this is called from VHE hyp * context. When called from a VHE non-hyp context, kvm_update_va_mask() will * replace the instructions with `nop`s. */ #ifndef __KVM_VHE_HYPERVISOR__ asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" /* mask with va_mask */ "ror %0, %0, #1\n" /* rotate to the first tag bit */ "add %0, %0, #0\n" /* insert the low 12 bits of the tag */ "add %0, %0, #0, lsl 12\n" /* insert the top 12 bits of the tag */ "ror %0, %0, #63\n", /* rotate back */ ARM64_ALWAYS_SYSTEM, kvm_update_va_mask) : "+r" (v)); #endif return v; } #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) extern u32 __hyp_va_bits; /* * We currently support using a VM-specified IPA size. For backward * compatibility, the default IPA size is fixed to 40bits. */ #define KVM_PHYS_SHIFT (40) #define kvm_phys_shift(mmu) VTCR_EL2_IPA((mmu)->vtcr) #define kvm_phys_size(mmu) (_AC(1, ULL) << kvm_phys_shift(mmu)) #define kvm_phys_mask(mmu) (kvm_phys_size(mmu) - _AC(1, ULL)) #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> int kvm_share_hyp(void *from, void *to); void kvm_unshare_hyp(void *from, void *to); int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int __create_hyp_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot); int hyp_alloc_private_va_range(size_t size, unsigned long *haddr); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr); int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr); void __init free_hyp_pgds(void); void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, bool may_block); void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_uninit_stage2_mmu(struct kvm *kvm); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int __init kvm_mmu_init(u32 *hyp_va_bits); static inline void *__kvm_vector_slot2addr(void *base, enum arm64_hyp_spectre_vector slot) { int idx = slot - (slot != HYP_VECTOR_DIRECT); return base + (idx * SZ_2K); } struct kvm; #define kvm_flush_dcache_to_poc(a,l) \ dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l)) static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { u64 cache_bits = SCTLR_ELx_M | SCTLR_ELx_C; int reg; if (vcpu_is_el2(vcpu)) reg = SCTLR_EL2; else reg = SCTLR_EL1; return (vcpu_read_sys_reg(vcpu, reg) & cache_bits) == cache_bits; } static inline void __clean_dcache_guest_page(void *va, size_t size) { /* * With FWB, we ensure that the guest always accesses memory using * cacheable attributes, and we don't have to clean to PoC when * faulting in pages. Furthermore, FWB implies IDC, so cleaning to * PoU is not required either in this case. */ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return; kvm_flush_dcache_to_poc(va, size); } static inline size_t __invalidate_icache_max_range(void) { u8 iminline; u64 ctr; asm volatile(ALTERNATIVE_CB("movz %0, #0\n" "movk %0, #0, lsl #16\n" "movk %0, #0, lsl #32\n" "movk %0, #0, lsl #48\n", ARM64_ALWAYS_SYSTEM, kvm_compute_final_ctr_el0) : "=r" (ctr)); iminline = SYS_FIELD_GET(CTR_EL0, IminLine, ctr) + 2; return MAX_DVM_OPS << iminline; } static inline void __invalidate_icache_guest_page(void *va, size_t size) { /* * Blow the whole I-cache if it is aliasing (i.e. VIPT) or the * invalidation range exceeds our arbitrary limit on invadations by * cache line. */ if (icache_is_aliasing() || size > __invalidate_icache_max_range()) icache_inval_all_pou(); else icache_inval_pou((unsigned long)va, (unsigned long)va + size); } void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); return get_vmid_bits(reg); } /* * We are not in the kvm->srcu critical section most of the time, so we take * the SRCU read lock here. Since we copy the data from the user page, we * can immediately drop the lock again. */ static inline int kvm_read_guest_lock(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { int srcu_idx = srcu_read_lock(&kvm->srcu); int ret = kvm_read_guest(kvm, gpa, data, len); srcu_read_unlock(&kvm->srcu, srcu_idx); return ret; } static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { int srcu_idx = srcu_read_lock(&kvm->srcu); int ret = kvm_write_guest(kvm, gpa, data, len); srcu_read_unlock(&kvm->srcu, srcu_idx); return ret; } #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) /* * When this is (directly or indirectly) used on the TLB invalidation * path, we rely on a previously issued DSB so that page table updates * and VMID reads are correctly ordered. */ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; u64 vmid_field, baddr; u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; vmid_field = atomic64_read(&vmid->id) << VTTBR_VMID_SHIFT; vmid_field &= VTTBR_VMID_MASK(kvm_arm_vmid_bits); return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } /* * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, struct kvm_arch *arch) { write_sysreg(mmu->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* * ARM errata 1165522 and 1530923 require the actual execution of the * above before we can switch to the EL1/EL0 translation regime used by * the guest. */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); } static inline u64 get_vmid(u64 vttbr) { return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >> VTTBR_VMID_SHIFT; } static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu) { return !(mmu->tlb_vttbr & VTTBR_CNP_BIT); } static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) { /* * Be careful, mmu may not be fully initialised so do look at * *any* of its fields. */ return &kvm->arch.mmu != mmu; } static inline void kvm_fault_lock(struct kvm *kvm) { if (is_protected_kvm_enabled()) write_lock(&kvm->mmu_lock); else read_lock(&kvm->mmu_lock); } static inline void kvm_fault_unlock(struct kvm *kvm) { if (is_protected_kvm_enabled()) write_unlock(&kvm->mmu_lock); else read_unlock(&kvm->mmu_lock); } #ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); #else static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} #endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ |
169 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* Copy ready events to userspace */ int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NETFILTER_NETDEV_H_ #define _NETFILTER_NETDEV_H_ #include <linux/netfilter.h> #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; /* Must recheck the ingress hook head, in the event it became NULL * after the check in nf_hook_ingress_active evaluated to true. */ if (unlikely(!e)) return 0; nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; return ret; } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { return 0; } static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } #endif /* CONFIG_NETFILTER_INGRESS */ #ifdef CONFIG_NETFILTER_EGRESS static inline bool nf_hook_egress_active(void) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) return false; #endif return true; } /** * nf_hook_egress - classify packets before transmission * @skb: packet to be classified * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. * On egress, the order is reversed for symmetry. Conceptually, tc and * netfilter can be thought of as layers, with netfilter layered above tc: * When tc redirects a packet to another interface, netfilter is not applied * because the packet is on the tc layer. * * The nf_skip_egress flag controls whether netfilter is applied on egress. * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the * packet passes through tc and netfilter. Because __dev_queue_xmit() may be * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. * * Returns: @skb on success or %NULL if the packet was consumed or filtered. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { struct nf_hook_entries *e; struct nf_hook_state state; int ret; #ifdef CONFIG_NETFILTER_SKIP_EGRESS if (skb->nf_skip_egress) return skb; #endif e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, NULL, dev, NULL, dev_net(dev), NULL); /* nf assumes rcu_read_lock, not just read_lock_bh */ rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); rcu_read_unlock(); if (ret == 1) { return skb; } else if (ret < 0) { *rc = NET_XMIT_DROP; return NULL; } else { /* ret == 0 */ *rc = NET_XMIT_SUCCESS; return NULL; } } #else /* CONFIG_NETFILTER_EGRESS */ static inline bool nf_hook_egress_active(void) { return false; } static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { return skb; } #endif /* CONFIG_NETFILTER_EGRESS */ static inline void nf_skip_egress(struct sk_buff *skb, bool skip) { #ifdef CONFIG_NETFILTER_SKIP_EGRESS skb->nf_skip_egress = skip; #endif } static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif #ifdef CONFIG_NETFILTER_EGRESS RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); #endif } #endif /* _NETFILTER_NETDEV_H_ */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_FLS64_H_ #define _ASM_GENERIC_BITOPS_FLS64_H_ #include <asm/types.h> /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #if BITS_PER_LONG == 32 static __always_inline int fls64(__u64 x) { __u32 h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #elif BITS_PER_LONG == 64 static __always_inline int fls64(__u64 x) { if (x == 0) return 0; return __fls(x) + 1; } #else #error BITS_PER_LONG not 32 or 64 #endif #endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */ |
125 99 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LINUX_RESUME_USER_MODE_H #define LINUX_RESUME_USER_MODE_H #include <linux/sched.h> #include <linux/task_work.h> #include <linux/memcontrol.h> #include <linux/rseq.h> #include <linux/blk-cgroup.h> /** * set_notify_resume - cause resume_user_mode_work() to be called * @task: task that will call resume_user_mode_work() * * Calling this arranges that @task will call resume_user_mode_work() * before returning to user mode. If it's already running in user mode, * it will enter the kernel and call resume_user_mode_work() soon. * If it's blocked, it will not be woken. */ static inline void set_notify_resume(struct task_struct *task) { if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) kick_process(task); } /** * resume_user_mode_work - Perform work before returning to user mode * @regs: user-mode registers of @current task * * This is called when %TIF_NOTIFY_RESUME has been set. Now we are * about to return to user mode, and the user state in @regs can be * inspected or adjusted. The caller in arch code has cleared * %TIF_NOTIFY_RESUME before the call. If the flag gets set again * asynchronously, this will be called again before we return to * user mode. * * Called without locks. */ static inline void resume_user_mode_work(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); /* * This barrier pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ smp_mb__after_atomic(); if (unlikely(task_work_pending(current))) task_work_run(); #ifdef CONFIG_KEYS_REQUEST_CACHE if (unlikely(current->cached_requested_key)) { key_put(current->cached_requested_key); current->cached_requested_key = NULL; } #endif mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); } #endif /* LINUX_RESUME_USER_MODE_H */ |
3 3 3 3 3 2 3 3 1 3 3 3 3 3 3 4 26 26 166 109 58 58 58 58 27 27 1 27 2 2 2 1 2 2 1 1 2 35 92 83 1 83 83 83 35 6 5 2 2 1 59 59 1 58 59 59 59 59 46 46 45 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/irqchip/arm-gic-v3.h> #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/string_choices.h> #include <kvm/arm_vgic.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_asm.h> #include "vgic.h" static bool group0_trap; static bool group1_trap; static bool common_trap; static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; } static bool lr_signals_eoi_mi(u64 lr_val) { return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && !(lr_val & ICH_LR_HW); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; for (lr = 0; lr < cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { intid = val & ICH_LR_VIRTUAL_ID_MASK; } else { intid = val & GICH_LR_VIRTUALID; is_v2_sgi = vgic_irq_is_sgi(intid); } /* Notify fds when the guest EOI'ed a level-triggered IRQ */ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) kvm_notify_acked_irq(vcpu->kvm, 0, intid - VGIC_NR_PRIVATE_IRQS); irq = vgic_get_vcpu_irq(vcpu, intid); if (!irq) /* An LPI could have been unmapped. */ continue; raw_spin_lock(&irq->irq_lock); /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) irq->active_source = cpuid; /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && (val & ICH_LR_PENDING_BIT)) { irq->pending_latch = true; if (is_v2_sgi) irq->source |= (1 << cpuid); } /* * Clear soft pending state when level irqs have been acked. */ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } cpuif->used_lrs = 0; } /* Requires the irq to be locked already */ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); if (irq->active) { val |= ICH_LR_ACTIVE_BIT; if (is_v2_sgi) val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; if (vgic_irq_is_multi_sgi(irq)) { allow_pending = false; val |= ICH_LR_EOI; } } if (irq->hw && !vgic_irq_needs_resampling(irq)) { val |= ICH_LR_HW; val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; /* * Never set pending+active on a HW interrupt, as the * pending state is kept at the physical distributor * level. */ if (irq->active) allow_pending = false; } else { if (irq->config == VGIC_CONFIG_LEVEL) { val |= ICH_LR_EOI; /* * Software resampling doesn't work very well * if we allow P+A, so let's not do that. */ if (irq->active) allow_pending = false; } } if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", irq->intid)) return; val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; irq->source &= ~(1 << (src - 1)); if (irq->source) { irq->pending_latch = true; val |= ICH_LR_EOI; } } } /* * Level-triggered mapped IRQs are special because we only observe * rising edges as input to the VGIC. We therefore lower the line * level here, so that we can take new virtual IRQs. See * vgic_v3_fold_lr_state for more info. */ if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) irq->line_level = false; if (irq->group) val |= ICH_LR_GROUP; val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) { vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; } void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; u32 vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & ICH_VMCR_ACK_CTL_MASK; vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & ICH_VMCR_FIQ_EN_MASK; } else { /* * When emulating GICv3 on GICv3 with SRE=1 on the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ vmcr = ICH_VMCR_FIQ_EN_MASK; } vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; cpu_if->vgic_vmcr = vmcr; } void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; u32 vmcr; vmcr = cpu_if->vgic_vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> ICH_VMCR_ACK_CTL_SHIFT; vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> ICH_VMCR_FIQ_EN_SHIFT; } else { /* * When emulating GICv3 on GICv3 with SRE=1 on the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ vmcrp->fiqen = 1; vmcrp->ackctl = 0; } vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; } #define INITIAL_PENDBASER_VALUE \ (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) void vgic_v3_enable(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; /* * By forcing VMCR to zero, the GIC will restore the binary * points to their reset values. Anything else resets to zero * anyway. */ vgic_v3->vgic_vmcr = 0; /* * If we are emulating a GICv3, we do it in an non-GICv2-compatible * way, so we force SRE to 1 to demonstrate this to the guest. * Also, we don't support any form of IRQ/FIQ bypass. * This goes with the spec allowing the value to be RAO/WI. */ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE); vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; } else { vgic_v3->vgic_sre = 0; } vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits, kvm_vgic_global_state.ich_vtr_el2); vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, kvm_vgic_global_state.ich_vtr_el2) + 1; /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EL2_En; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; /* Hide GICv3 sysreg if necessary */ if (!kvm_has_gicv3(vcpu->kvm)) { vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); return; } if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; if (group1_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; if (dir_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) { struct kvm_vcpu *vcpu; int byte_offset, bit_nr; gpa_t pendbase, ptr; bool status; u8 val; int ret; unsigned long flags; retry: vcpu = irq->target_vcpu; if (!vcpu) return 0; pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); byte_offset = irq->intid / BITS_PER_BYTE; bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; status = val & (1 << bit_nr); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->target_vcpu != vcpu) { raw_spin_unlock_irqrestore(&irq->irq_lock, flags); goto retry; } irq->pending_latch = status; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } return 0; } /* * The deactivation of the doorbell interrupt will trigger the * unmapping of the associated vPE. */ static void unmap_all_vpes(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; int i; for (i = 0; i < dist->its_vm.nr_vpes; i++) free_irq(dist->its_vm.vpes[i]->irq, kvm_get_vcpu(kvm, i)); } static void map_all_vpes(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; int i; for (i = 0; i < dist->its_vm.nr_vpes; i++) WARN_ON(vgic_v4_request_vpe_irq(kvm_get_vcpu(kvm, i), dist->its_vm.vpes[i]->irq)); } /* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ int vgic_v3_save_pending_tables(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; bool vlpi_avail = false; unsigned long index; int ret = 0; u8 val; if (unlikely(!vgic_initialized(kvm))) return -ENXIO; /* * A preparation for getting any VLPI states. * The above vgic initialized check also ensures that the allocation * and enabling of the doorbells have already been done. */ if (kvm_vgic_global_state.has_gicv4_1) { unmap_all_vpes(kvm); vlpi_avail = true; } xa_for_each(&dist->lpi_xa, index, irq) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; bool is_pending; bool stored; vcpu = irq->target_vcpu; if (!vcpu) continue; pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); byte_offset = irq->intid / BITS_PER_BYTE; bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; if (ptr != last_ptr) { ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; last_ptr = ptr; } stored = val & (1U << bit_nr); is_pending = irq->pending_latch; if (irq->hw && vlpi_avail) vgic_v4_get_vlpi_state(irq, &is_pending); if (stored == is_pending) continue; if (is_pending) val |= 1 << bit_nr; else val &= ~(1 << bit_nr); ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } out: if (vlpi_avail) map_all_vpes(kvm); return ret; } /** * vgic_v3_rdist_overlap - check if a region overlaps with any * existing redistributor region * * @kvm: kvm handle * @base: base of the region * @size: size of region * * Return: true if there is an overlap */ bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, &d->rd_regions, list) { if ((base + size > rdreg->base) && (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) return true; } return false; } /* * Check for overlapping regions and for regions crossing the end of memory * for base addresses which have already been set. */ bool vgic_v3_check_base(struct kvm *kvm) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) return false; list_for_each_entry(rdreg, &d->rd_regions, list) { size_t sz = vgic_v3_rd_region_size(kvm, rdreg); if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, rdreg->base, SZ_64K, sz)) return false; } if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) return true; return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, KVM_VGIC_V3_DIST_SIZE); } /** * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one * which has free space to put a new rdist region. * * @rd_regions: redistributor region list head * * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). * Stride between redistributors is 0 and regions are filled in the index order. * * Return: the redist region handle, if any, that has space to map a new rdist * region. */ struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) { struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, rd_regions, list) { if (!vgic_v3_redist_region_full(rdreg)) return rdreg; } return NULL; } struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index) { struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, rd_regions, list) { if (rdreg->index == index) return rdreg; } return NULL; } int vgic_v3_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; unsigned long c; kvm_for_each_vcpu(c, vcpu, kvm) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { kvm_debug("vcpu %ld redistributor base not set\n", c); return -ENXIO; } } if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { kvm_debug("Need to set vgic distributor addresses first\n"); return -ENXIO; } if (!vgic_v3_check_base(kvm)) { kvm_debug("VGIC redist and dist frames overlap\n"); return -EINVAL; } /* * For a VGICv3 we require the userland to explicitly initialize * the VGIC before we need to use it. */ if (!vgic_initialized(kvm)) { return -EBUSY; } if (kvm_vgic_global_state.has_gicv4_1) vgic_v4_configure_vsgis(kvm); return 0; } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); static int __init early_group0_trap_cfg(char *buf) { return kstrtobool(buf, &group0_trap); } early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); static int __init early_group1_trap_cfg(char *buf) { return kstrtobool(buf, &group1_trap); } early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); static int __init early_common_trap_cfg(char *buf) { return kstrtobool(buf, &common_trap); } early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); static int __init early_gicv4_enable(char *buf) { return kstrtobool(buf, &gicv4_enable); } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {}, }; static bool vgic_v3_broken_seis(void) { return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && is_midr_in_range_list(broken_seis)); } /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description * * Returns 0 if the VGICv3 has been probed successfully, returns an error code * otherwise */ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; int ret; has_v2 = ich_vtr_el2 >> 63; ich_vtr_el2 = (u32)ich_vtr_el2; /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... */ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; kvm_vgic_global_state.can_emulate_gicv2 = false; kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; /* GICv4 support? */ if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); } else if (!has_v2) { pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); } else if (kvm_get_mode() != KVM_MODE_PROTECTED) { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); if (ret) { kvm_err("Cannot register GICv2 KVM device.\n"); return ret; } kvm_info("vgic-v2@%llx\n", info->vcpu.start); } ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); if (ret) { kvm_err("Cannot register GICv3 KVM device.\n"); kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); return ret; } if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { group0_trap = true; group1_trap = true; } if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; group0_trap = true; group1_trap = true; if (ich_vtr_el2 & ICH_VTR_EL2_TDS) dir_trap = true; else common_trap = true; } if (group0_trap || group1_trap || common_trap | dir_trap) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", common_trap ? "C" : "", dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } kvm_vgic_global_state.vctrl_base = NULL; kvm_vgic_global_state.type = VGIC_V3; kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; return 0; } void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; /* If the vgic is nested, perform the full state loading */ if (vgic_state_is_nested(vcpu)) { vgic_v3_load_nested(vcpu); return; } if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); WARN_ON(vgic_v4_load(vcpu)); } void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (vgic_state_is_nested(vcpu)) { vgic_v3_put_nested(vcpu); return; } if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); } |
8 8 8 9 3 7 4 6 1 4 1 33 33 204 3 148 28 33 162 44 201 202 202 199 201 2 202 199 3 202 202 159 48 2 2 202 1 201 3 3 218 5 48 182 222 48 182 1 217 5 217 5 185 9 33 219 222 2 6 22 200 222 222 222 50 50 1 49 52 6 50 39 39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; int error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; error = fsnotify_truncate_perm(path, length); if (error) return error; error = mnt_want_write(path->mnt); if (error) return error; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); return error; } EXPORT_SYMBOL_GPL(vfs_truncate); int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; int error; /* explicitly opened as large or we are on 64-bit box */ if (file->f_flags & O_LARGEFILE) small = 0; dentry = file->f_path.dentry; inode = dentry->d_inode; if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) return -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) return -EINVAL; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(file))) return -EPERM; error = security_file_truncate(file); if (error) return error; error = fsnotify_truncate_perm(&file->f_path, length); if (error) return error; sb_start_write(inode->i_sb); error = do_truncate(file_mnt_idmap(file), dentry, length, ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); return error; } int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return do_ftruncate(fd_file(f), length, small); } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length)); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); int ret; loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* * Modes are exclusive, even if that is not obvious from the encoding * as bit masks and the mix with the flag in the same namespace. * * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is * encoded as no bit set. */ switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_ALLOCATE_RANGE: case FALLOC_FL_UNSHARE_RANGE: case FALLOC_FL_ZERO_RANGE: break; case FALLOC_FL_PUNCH_HOLE: if (!(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; default: return -EOPNOTSUPP; } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wraparound */ if (check_add_overflow(offset, len, &sum)) return -EFBIG; if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. Make sure it stays in sync if making any changes in this * routine. */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-racy to avoid RCU * freeing. */ override_cred->non_rcu = 1; return override_creds(override_cred); } static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. */ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) put_cred(revert_creds(old_cred)); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { CLASS(fd_raw, f)(fd); int error; if (fd_empty(f)) return -EBADF; if (!d_can_lookup(fd_file(f)->f_path.dentry)) return -ENOTDIR; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static inline int file_get_write_access(struct file *f) { int error; error = get_write_access(f->f_inode); if (unlikely(error)) return error; error = mnt_get_write_access(f->f_path.mnt); if (unlikely(error)) goto cleanup_inode; if (unlikely(f->f_mode & FMODE_BACKING)) { error = mnt_get_write_access(backing_file_user_path(f)->mnt); if (unlikely(error)) goto cleanup_mnt; } return 0; cleanup_mnt: mnt_put_write_access(f->f_path.mnt); cleanup_inode: put_write_access(f->f_inode); return error; } static int do_dentry_open(struct file *f, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; /* * Set FMODE_NONOTIFY_* bits according to existing permission watches. * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a * pseudo file, this call will not change the mode. */ file_set_fsnotify_mode_from_watchers(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Depends on full fence from get_write_access() to synchronize * against collapse_file() regarding i_writecount and nr_thps * updates. Ensures subsequent insertion of THPs into the page * cache will fail. */ if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns "0" which must be the return value of ->atomic_open() after having * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { int ret; file->f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for * symmetry. */ fsnotify_open(file); } return ret; } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred) { struct file *f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { int error; file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } /** * dentry_create - Create and open a file * @path: path to create * @flags: O_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a "struct file *". Otherwise a ERR_PTR * is returned. */ struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) error = vfs_open(path, f); if (unlikely(error)) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL(dentry_create); /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; error = do_dentry_open(f, NULL); if (error) { fput(f); return ERR_PTR(error); } fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static int do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; struct filename *tmp; int err, fd; err = build_open_flags(how, &op); if (unlikely(err)) return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fd_install(fd, f); } } putname(tmp); return fd; } int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput_close(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = file_close_fd(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ fput_close_sync(file); if (likely(retval == 0)) return 0; /* can't restart close syscall because file table entry was cleared */ if (retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/pid.h> #include <linux/posix-timers_types.h> #include <linux/rcuref.h> #include <linux/spinlock.h> #include <linux/timerqueue.h> struct kernel_siginfo; struct task_struct; struct sigqueue; struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { return ((~pid) << 3) | clock; } static inline clockid_t make_thread_cpuclock(const unsigned int tid, const clockid_t clock) { return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK); } static inline clockid_t fd_to_clockid(const int fd) { return make_process_cpuclock((unsigned int) fd, CLOCKFD); } static inline int clockid_to_fd(const clockid_t clk) { return ~(clk >> 3); } #ifdef CONFIG_POSIX_TIMERS #include <linux/signal_types.h> /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig * @head: timerqueue head on which this timer is queued * @pid: Pointer to target task PID * @elist: List head for the expiry list * @firing: Timer is currently firing * @nanosleep: Timer is used for nanosleep and is not a regular posix-timer * @handling: Pointer to the task which handles expiry */ struct cpu_timer { struct timerqueue_node node; struct timerqueue_head *head; struct pid *pid; struct list_head elist; bool firing; bool nanosleep; struct task_struct __rcu *handling; }; static inline bool cpu_timer_enqueue(struct timerqueue_head *head, struct cpu_timer *ctmr) { ctmr->head = head; return timerqueue_add(head, &ctmr->node); } static inline bool cpu_timer_queued(struct cpu_timer *ctmr) { return !!ctmr->head; } static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; return true; } return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) { return ctmr->node.expires; } static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) { ctmr->node.expires = exp; } static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); long posixtimer_create_prctl(unsigned long ctrl); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ .nextevt = U64_MAX, \ } #define INIT_CPU_TIMERBASES(b) { \ INIT_CPU_TIMERBASE(b[0]), \ INIT_CPU_TIMERBASE(b[1]), \ INIT_CPU_TIMERBASE(b[2]), \ } #define INIT_CPU_TIMERS(s) \ .posix_cputimers = { \ .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK void clear_posix_cputimers_work(struct task_struct *p); void posix_cputimers_init_work(void); #else static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List node for binding the timer to tsk::signal::posix_timers * @ignored_list: List node for tracking ignored timers in tsk::signal::ignored_posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_status: The status of the timer * @it_sig_periodic: The periodic status at signal delivery * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery * @it_sigqueue_seq: The sequence count at the point where the signal was queued * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @rcuref: Reference count for life time management * @sigq: Embedded sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. */ struct k_itimer { /* 1st cacheline contains read-mostly fields */ struct hlist_node t_hash; struct hlist_node list; timer_t it_id; clockid_t it_clock; int it_sigev_notify; enum pid_type it_pid_type; struct signal_struct *it_signal; const struct k_clock *kclock; /* 2nd cacheline and above contain fields which are modified regularly */ spinlock_t it_lock; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; ktime_t it_interval; struct hlist_node ignored_list; union { struct pid *it_pid; struct task_struct *it_process; }; struct sigqueue sigq; rcuref_t rcuref; union { struct { struct hrtimer timer; } real; struct cpu_timer cpu; struct { struct alarm alarmtimer; } alarm; } it; struct rcu_head rcu; } ____cacheline_aligned_in_smp; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); #ifdef CONFIG_POSIX_TIMERS static inline void posixtimer_putref(struct k_itimer *tmr) { if (rcuref_put(&tmr->rcuref)) posixtimer_free_timer(tmr); } static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); WARN_ON_ONCE(!rcuref_get(&tmr->rcuref)); } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); posixtimer_putref(tmr); } static inline bool posixtimer_valid(const struct k_itimer *timer) { unsigned long val = (unsigned long)timer->it_signal; return !(val & 0x1UL); } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } #endif /* !CONFIG_POSIX_TIMERS */ #endif |
34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 | // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/freezer.h> #include <linux/stddef.h> #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/rseq.h> #include <linux/syscalls.h> #include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/exception.h> #include <asm/cacheflush.h> #include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/signal32.h> #include <asm/traps.h> #include <asm/vdso.h> #define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ struct rt_sigframe { struct siginfo info; struct ucontext uc; }; struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; unsigned long size; /* size of allocated sigframe data */ unsigned long limit; /* largest allowed size */ unsigned long fpsimd_offset; unsigned long esr_offset; unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; unsigned long zt_offset; unsigned long fpmr_offset; unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; /* * Holds any EL0-controlled state that influences unprivileged memory accesses. * This includes both accesses done in userspace and uaccess done in the kernel. * * This state needs to be carefully managed to ensure that it doesn't cause * uaccess to fail when setting up the signal frame, and the signal handler * itself also expects a well-defined state when entered. */ struct user_access_state { u64 por_el0; }; #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) /* * Save the user access state into ua_state and reset it to disable any * restrictions. */ static void save_reset_user_access_state(struct user_access_state *ua_state) { if (system_supports_poe()) { u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); /* Ensure that any subsequent uaccess observes the updated value */ isb(); } } /* * Set the user access state for invoking the signal handler. * * No uaccess should be done after that function is called. */ static void set_handler_user_access_state(void) { if (system_supports_poe()) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } /* * Restore the user access state to the values saved in ua_state. * * No uaccess should be done after that function is called. */ static void restore_user_access_state(const struct user_access_state *ua_state) { if (system_supports_poe()) write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); } static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = sizeof(user->sigframe->uc.uc_mcontext.__reserved); memset(user, 0, sizeof(*user)); user->size = offsetof(struct rt_sigframe, uc.uc_mcontext.__reserved); user->limit = user->size + reserved_size; user->limit -= TERMINATOR_SIZE; user->limit -= EXTRA_CONTEXT_SIZE; /* Reserve space for extension and terminator ^ */ } static size_t sigframe_size(struct rt_sigframe_user_layout const *user) { return round_up(max(user->size, sizeof(struct rt_sigframe)), 16); } /* * Sanity limit on the approximate maximum size of signal frame we'll * try to generate. Stack alignment padding and the frame record are * not taken into account. This limit is not a guarantee and is * NOT ABI. */ #define SIGFRAME_MAXSZ SZ_256K static int __sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size, bool extend) { size_t padded_size = round_up(size, 16); if (padded_size > user->limit - user->size && !user->extra_offset && extend) { int ret; user->limit += EXTRA_CONTEXT_SIZE; ret = __sigframe_alloc(user, &user->extra_offset, sizeof(struct extra_context), false); if (ret) { user->limit -= EXTRA_CONTEXT_SIZE; return ret; } /* Reserve space for the __reserved[] terminator */ user->size += TERMINATOR_SIZE; /* * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for * the terminator: */ user->limit = SIGFRAME_MAXSZ - TERMINATOR_SIZE; } /* Still not enough space? Bad luck! */ if (padded_size > user->limit - user->size) return -ENOMEM; *offset = user->size; user->size += padded_size; return 0; } /* * Allocate space for an optional record of <size> bytes in the user * signal frame. The offset from the signal frame base address to the * allocated block is assigned to *offset. */ static int sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size) { return __sigframe_alloc(user, offset, size, true); } /* Allocate the null terminator record and prevent further allocations */ static int sigframe_alloc_end(struct rt_sigframe_user_layout *user) { int ret; /* Un-reserve the space reserved for the terminator: */ user->limit += TERMINATOR_SIZE; ret = sigframe_alloc(user, &user->end_offset, sizeof(struct _aarch64_ctx)); if (ret) return ret; /* Prevent further allocation: */ user->limit = user->size; return 0; } static void __user *apply_user_offset( struct rt_sigframe_user_layout const *user, unsigned long offset) { char __user *base = (char __user *)user->sigframe; return base + offset; } struct user_ctxs { struct fpsimd_context __user *fpsimd; u32 fpsimd_size; struct sve_context __user *sve; u32 sve_size; struct tpidr2_context __user *tpidr2; u32 tpidr2_size; struct za_context __user *za; u32 za_size; struct zt_context __user *zt; u32 zt_size; struct fpmr_context __user *fpmr; u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; struct gcs_context __user *gcs; u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { struct user_fpsimd_state const *fpsimd = ¤t->thread.uw.fpsimd_state; int err; /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); __put_user_error(fpsimd->fpcr, &ctx->fpcr, err); /* copy the magic/size information */ __put_user_error(FPSIMD_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(struct fpsimd_context), &ctx->head.size, err); return err ? -EFAULT : 0; } static int restore_fpsimd_context(struct user_ctxs *user) { struct user_fpsimd_state fpsimd; int err = 0; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), sizeof(fpsimd.vregs)); __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); clear_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ if (!err) fpsimd_update_current_state(&fpsimd); return err ? -EFAULT : 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); return err; } static int restore_fpmr_context(struct user_ctxs *user) { u64 fpmr; int err = 0; if (user->fpmr_size != sizeof(*user->fpmr)) return -EINVAL; __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) write_sysreg_s(fpmr, SYS_FPMR); return err; } static int preserve_poe_context(struct poe_context __user *ctx, const struct user_access_state *ua_state) { int err = 0; __put_user_error(POE_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(ua_state->por_el0, &ctx->por_el0, err); return err; } static int restore_poe_context(struct user_ctxs *user, struct user_access_state *ua_state) { u64 por_el0; int err = 0; if (user->poe_size != sizeof(*user->poe)) return -EINVAL; __get_user_error(por_el0, &(user->poe->por_el0), err); if (!err) ua_state->por_el0 = por_el0; return err; } #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; u16 flags = 0; unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; if (thread_sm_enabled(¤t->thread)) { vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); flags |= SVE_SIG_FLAG_SM; } else if (current->thread.fp_type == FP_STATE_SVE) { vq = sve_vq_from_vl(vl); } memset(reserved, 0, sizeof(reserved)); __put_user_error(SVE_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { /* * This assumes that the SVE state has already been saved to * the task struct by calling the function * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); } return err ? -EFAULT : 0; } static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err = 0; unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; __get_user_error(user_vl, &(user->sve->vl), err); __get_user_error(flags, &(user->sve->flags), err); if (err) return err; if (flags & SVE_SIG_FLAG_SM) { if (!system_supports_sme()) return -EINVAL; vl = task_get_sme_vl(current); } else { /* * A SME only system use SVE for streaming mode so can * have a SVE formatted context with a zero VL and no * payload data. */ if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; vl = task_get_sve_vl(current); } if (user_vl != vl) return -EINVAL; if (user->sve_size == sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; goto fpsimd_only; } vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; /* * Careful: we are about __copy_from_user() directly into * thread.sve_state with preemption enabled, so protection is * needed to prevent a racing context switch from writing stale * registers back over the new data. */ fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); return -ENOMEM; } err = __copy_from_user(current->thread.sve_state, (char __user const *)user->sve + SVE_SIG_REGS_OFFSET, SVE_SIG_REGS_SIZE(vq)); if (err) return -EFAULT; if (flags & SVE_SIG_FLAG_SM) current->thread.svcr |= SVCR_SM_MASK; else set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; fpsimd_only: /* copy the FP and status/control registers */ /* restore_sigframe() already checked that user->fpsimd != NULL. */ err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, sizeof(fpsimd.vregs)); __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); /* load the hardware registers from the fpsimd_state structure */ if (!err) fpsimd_update_current_state(&fpsimd); return err ? -EFAULT : 0; } #else /* ! CONFIG_ARM64_SVE */ static int restore_sve_fpsimd_context(struct user_ctxs *user) { WARN_ON_ONCE(1); return -EINVAL; } /* Turn any non-optimised out attempts to use this into a link error: */ extern int preserve_sve_context(void __user *ctx); #endif /* ! CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { int err = 0; current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); return err; } static int restore_tpidr2_context(struct user_ctxs *user) { u64 tpidr2_el0; int err = 0; if (user->tpidr2_size != sizeof(*user->tpidr2)) return -EINVAL; __get_user_error(tpidr2_el0, &user->tpidr2->tpidr2, err); if (!err) write_sysreg_s(tpidr2_el0, SYS_TPIDR2_EL0); return err; } static int preserve_za_context(struct za_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; unsigned int vl = task_get_sme_vl(current); unsigned int vq; if (thread_za_enabled(¤t->thread)) vq = sve_vq_from_vl(vl); else vq = 0; memset(reserved, 0, sizeof(reserved)); __put_user_error(ZA_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { /* * This assumes that the ZA state has already been saved to * the task struct by calling the function * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); } return err ? -EFAULT : 0; } static int restore_za_context(struct user_ctxs *user) { int err = 0; unsigned int vq; u16 user_vl; if (user->za_size < sizeof(*user->za)) return -EINVAL; __get_user_error(user_vl, &(user->za->vl), err); if (err) return err; if (user_vl != task_get_sme_vl(current)) return -EINVAL; if (user->za_size == sizeof(*user->za)) { current->thread.svcr &= ~SVCR_ZA_MASK; return 0; } vq = sve_vq_from_vl(user_vl); if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; /* * Careful: we are about __copy_from_user() directly into * thread.sme_state with preemption enabled, so protection is * needed to prevent a racing context switch from writing stale * registers back over the new data. */ fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; clear_thread_flag(TIF_SME); return -ENOMEM; } err = __copy_from_user(current->thread.sme_state, (char __user const *)user->za + ZA_SIG_REGS_OFFSET, ZA_SIG_REGS_SIZE(vq)); if (err) return -EFAULT; set_thread_flag(TIF_SME); current->thread.svcr |= SVCR_ZA_MASK; return 0; } static int preserve_zt_context(struct zt_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; if (WARN_ON(!thread_za_enabled(¤t->thread))) return -EINVAL; memset(reserved, 0, sizeof(reserved)); __put_user_error(ZT_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(ZT_SIG_CONTEXT_SIZE(1), 16), &ctx->head.size, err); __put_user_error(1, &ctx->nregs, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); /* * This assumes that the ZT state has already been saved to * the task struct by calling the function * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(¤t->thread), ZT_SIG_REGS_SIZE(1)); return err ? -EFAULT : 0; } static int restore_zt_context(struct user_ctxs *user) { int err; u16 nregs; /* ZA must be restored first for this check to be valid */ if (!thread_za_enabled(¤t->thread)) return -EINVAL; if (user->zt_size != ZT_SIG_CONTEXT_SIZE(1)) return -EINVAL; if (__copy_from_user(&nregs, &(user->zt->nregs), sizeof(nregs))) return -EFAULT; if (nregs != 1) return -EINVAL; /* * Careful: we are about __copy_from_user() directly into * thread.zt_state with preemption enabled, so protection is * needed to prevent a racing context switch from writing stale * registers back over the new data. */ fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ err = __copy_from_user(thread_zt_state(¤t->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, ZT_SIG_REGS_SIZE(1)); if (err) return -EFAULT; return 0; } #else /* ! CONFIG_ARM64_SME */ /* Turn any non-optimised out attempts to use these into a link error: */ extern int preserve_tpidr2_context(void __user *ctx); extern int restore_tpidr2_context(struct user_ctxs *user); extern int preserve_za_context(void __user *ctx); extern int restore_za_context(struct user_ctxs *user); extern int preserve_zt_context(void __user *ctx); extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_GCS static int preserve_gcs_context(struct gcs_context __user *ctx) { int err = 0; u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); /* * If GCS is enabled we will add a cap token to the frame, * include it in the GCSPR_EL0 we report to support stack * switching via sigreturn if GCS is enabled. We do not allow * enabling via sigreturn so the token is only relevant for * threads with GCS enabled. */ if (task_gcs_el0_enabled(current)) gcspr -= 8; __put_user_error(GCS_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(gcspr, &ctx->gcspr, err); __put_user_error(0, &ctx->reserved, err); __put_user_error(current->thread.gcs_el0_mode, &ctx->features_enabled, err); return err; } static int restore_gcs_context(struct user_ctxs *user) { u64 gcspr, enabled; int err = 0; if (user->gcs_size != sizeof(*user->gcs)) return -EINVAL; __get_user_error(gcspr, &user->gcs->gcspr, err); __get_user_error(enabled, &user->gcs->features_enabled, err); if (err) return err; /* Don't allow unknown modes */ if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) return -EINVAL; err = gcs_check_locked(current, enabled); if (err != 0) return err; /* Don't allow enabling */ if (!task_gcs_el0_enabled(current) && (enabled & PR_SHADOW_STACK_ENABLE)) return -EINVAL; /* If we are disabling disable everything */ if (!(enabled & PR_SHADOW_STACK_ENABLE)) enabled = 0; current->thread.gcs_el0_mode = enabled; /* * We let userspace set GCSPR_EL0 to anything here, we will * validate later in gcs_restore_signal(). */ write_sysreg_s(gcspr, SYS_GCSPR_EL0); return 0; } #else /* ! CONFIG_ARM64_GCS */ /* Turn any non-optimised out attempts to use these into a link error: */ extern int preserve_gcs_context(void __user *ctx); extern int restore_gcs_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_GCS */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { struct sigcontext __user *const sc = &sf->uc.uc_mcontext; struct _aarch64_ctx __user *head; char __user *base = (char __user *)&sc->__reserved; size_t offset = 0; size_t limit = sizeof(sc->__reserved); bool have_extra_context = false; char const __user *const sfp = (char const __user *)sf; user->fpsimd = NULL; user->sve = NULL; user->tpidr2 = NULL; user->za = NULL; user->zt = NULL; user->fpmr = NULL; user->poe = NULL; user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; while (1) { int err = 0; u32 magic, size; char const __user *userp; struct extra_context const __user *extra; u64 extra_datap; u32 extra_size; struct _aarch64_ctx const __user *end; u32 end_magic, end_size; if (limit - offset < sizeof(*head)) goto invalid; if (!IS_ALIGNED(offset, 16)) goto invalid; head = (struct _aarch64_ctx __user *)(base + offset); __get_user_error(magic, &head->magic, err); __get_user_error(size, &head->size, err); if (err) return err; if (limit - offset < size) goto invalid; switch (magic) { case 0: if (size) goto invalid; goto done; case FPSIMD_MAGIC: if (!system_supports_fpsimd()) goto invalid; if (user->fpsimd) goto invalid; user->fpsimd = (struct fpsimd_context __user *)head; user->fpsimd_size = size; break; case ESR_MAGIC: /* ignore */ break; case POE_MAGIC: if (!system_supports_poe()) goto invalid; if (user->poe) goto invalid; user->poe = (struct poe_context __user *)head; user->poe_size = size; break; case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) goto invalid; user->sve = (struct sve_context __user *)head; user->sve_size = size; break; case TPIDR2_MAGIC: if (!system_supports_tpidr2()) goto invalid; if (user->tpidr2) goto invalid; user->tpidr2 = (struct tpidr2_context __user *)head; user->tpidr2_size = size; break; case ZA_MAGIC: if (!system_supports_sme()) goto invalid; if (user->za) goto invalid; user->za = (struct za_context __user *)head; user->za_size = size; break; case ZT_MAGIC: if (!system_supports_sme2()) goto invalid; if (user->zt) goto invalid; user->zt = (struct zt_context __user *)head; user->zt_size = size; break; case FPMR_MAGIC: if (!system_supports_fpmr()) goto invalid; if (user->fpmr) goto invalid; user->fpmr = (struct fpmr_context __user *)head; user->fpmr_size = size; break; case GCS_MAGIC: if (!system_supports_gcs()) goto invalid; if (user->gcs) goto invalid; user->gcs = (struct gcs_context __user *)head; user->gcs_size = size; break; case EXTRA_MAGIC: if (have_extra_context) goto invalid; if (size < sizeof(*extra)) goto invalid; userp = (char const __user *)head; extra = (struct extra_context const __user *)userp; userp += size; __get_user_error(extra_datap, &extra->datap, err); __get_user_error(extra_size, &extra->size, err); if (err) return err; /* Check for the dummy terminator in __reserved[]: */ if (limit - offset - size < TERMINATOR_SIZE) goto invalid; end = (struct _aarch64_ctx const __user *)userp; userp += TERMINATOR_SIZE; __get_user_error(end_magic, &end->magic, err); __get_user_error(end_size, &end->size, err); if (err) return err; if (end_magic || end_size) goto invalid; /* Prevent looping/repeated parsing of extra_context */ have_extra_context = true; base = (__force void __user *)extra_datap; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; if (!IS_ALIGNED(extra_size, 16)) goto invalid; if (base != userp) goto invalid; /* Reject "unreasonably large" frames: */ if (extra_size > sfp + SIGFRAME_MAXSZ - userp) goto invalid; /* * Ignore trailing terminator in __reserved[] * and start parsing extra data: */ offset = 0; limit = extra_size; if (!access_ok(base, limit)) goto invalid; continue; default: goto invalid; } if (size < sizeof(*head)) goto invalid; if (limit - offset < size) goto invalid; offset += size; } done: return 0; invalid: return -EINVAL; } static int restore_sigframe(struct pt_regs *regs, struct rt_sigframe __user *sf, struct user_access_state *ua_state) { sigset_t set; int i, err; struct user_ctxs user; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); if (err == 0) set_current_blocked(&set); for (i = 0; i < 31; i++) __get_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], err); __get_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err); __get_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err); __get_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err); /* * Avoid sys_rt_sigreturn() restarting. */ forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); if (err == 0 && system_supports_fpsimd()) { if (!user.fpsimd) return -EINVAL; if (user.sve) err = restore_sve_fpsimd_context(&user); else err = restore_fpsimd_context(&user); } if (err == 0 && system_supports_gcs() && user.gcs) err = restore_gcs_context(&user); if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); if (err == 0 && system_supports_fpmr() && user.fpmr) err = restore_fpmr_context(&user); if (err == 0 && system_supports_sme() && user.za) err = restore_za_context(&user); if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); if (err == 0 && system_supports_poe() && user.poe) err = restore_poe_context(&user, ua_state); return err; } #ifdef CONFIG_ARM64_GCS static int gcs_restore_signal(void) { u64 gcspr_el0, cap; int ret; if (!system_supports_gcs()) return 0; if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) return 0; gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Ensure that any changes to the GCS done via GCS operations * are visible to the normal reads we do to validate the * token. */ gcsb_dsync(); /* * GCSPR_EL0 should be pointing at a capped GCS, read the cap. * We don't enforce that this is in a GCS page, if it is not * then faults will be generated on GCS operations - the main * concern is to protect GCS pages. */ ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, sizeof(cap)); if (ret) return -EFAULT; /* * Check that the cap is the actual GCS before replacing it. */ if (cap != GCS_SIGNAL_CAP(gcspr_el0)) return -EINVAL; /* Invalidate the token to prevent reuse */ put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); if (ret != 0) return -EFAULT; write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); return 0; } #else static int gcs_restore_signal(void) { return 0; } #endif SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should * be word aligned here. */ if (regs->sp & 15) goto badframe; frame = (struct rt_sigframe __user *)regs->sp; if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, frame, &ua_state)) goto badframe; if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; restore_user_access_state(&ua_state); return regs->regs[0]; badframe: arm64_notify_segfault(regs->sp); return 0; } /* * Determine the layout of optional records in the signal frame * * add_all: if true, lays out the biggest possible signal frame for * this task; otherwise, generates a layout for the current state * of the task. */ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, bool add_all) { int err; if (system_supports_fpsimd()) { err = sigframe_alloc(user, &user->fpsimd_offset, sizeof(struct fpsimd_context)); if (err) return err; } /* fault information, if valid */ if (add_all || current->thread.fault_code) { err = sigframe_alloc(user, &user->esr_offset, sizeof(struct esr_context)); if (err) return err; } #ifdef CONFIG_ARM64_GCS if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { err = sigframe_alloc(user, &user->gcs_offset, sizeof(struct gcs_context)); if (err) return err; } #endif if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; if (add_all || current->thread.fp_type == FP_STATE_SVE || thread_sm_enabled(¤t->thread)) { int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) vl = thread_get_cur_vl(¤t->thread); vq = sve_vq_from_vl(vl); } err = sigframe_alloc(user, &user->sve_offset, SVE_SIG_CONTEXT_SIZE(vq)); if (err) return err; } if (system_supports_tpidr2()) { err = sigframe_alloc(user, &user->tpidr2_offset, sizeof(struct tpidr2_context)); if (err) return err; } if (system_supports_sme()) { unsigned int vl; unsigned int vq = 0; if (add_all) vl = sme_max_vl(); else vl = task_get_sme_vl(current); if (thread_za_enabled(¤t->thread)) vq = sve_vq_from_vl(vl); err = sigframe_alloc(user, &user->za_offset, ZA_SIG_CONTEXT_SIZE(vq)); if (err) return err; } if (system_supports_sme2()) { if (add_all || thread_za_enabled(¤t->thread)) { err = sigframe_alloc(user, &user->zt_offset, ZT_SIG_CONTEXT_SIZE(1)); if (err) return err; } } if (system_supports_fpmr()) { err = sigframe_alloc(user, &user->fpmr_offset, sizeof(struct fpmr_context)); if (err) return err; } if (system_supports_poe()) { err = sigframe_alloc(user, &user->poe_offset, sizeof(struct poe_context)); if (err) return err; } return sigframe_alloc_end(user); } static int setup_sigframe(struct rt_sigframe_user_layout *user, struct pt_regs *regs, sigset_t *set, const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; /* set up the stack frame for unwinding */ __put_user_error(regs->regs[29], &user->next_frame->fp, err); __put_user_error(regs->regs[30], &user->next_frame->lr, err); for (i = 0; i < 31; i++) __put_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], err); __put_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err); __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err); __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err); __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); if (err == 0 && system_supports_fpsimd()) { struct fpsimd_context __user *fpsimd_ctx = apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); } /* fault information, if valid */ if (err == 0 && user->esr_offset) { struct esr_context __user *esr_ctx = apply_user_offset(user, user->esr_offset); __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } if (system_supports_gcs() && err == 0 && user->gcs_offset) { struct gcs_context __user *gcs_ctx = apply_user_offset(user, user->gcs_offset); err |= preserve_gcs_context(gcs_ctx); } /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } /* TPIDR2 if supported */ if (system_supports_tpidr2() && err == 0) { struct tpidr2_context __user *tpidr2_ctx = apply_user_offset(user, user->tpidr2_offset); err |= preserve_tpidr2_context(tpidr2_ctx); } /* FPMR if supported */ if (system_supports_fpmr() && err == 0) { struct fpmr_context __user *fpmr_ctx = apply_user_offset(user, user->fpmr_offset); err |= preserve_fpmr_context(fpmr_ctx); } if (system_supports_poe() && err == 0) { struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); err |= preserve_poe_context(poe_ctx, ua_state); } /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = apply_user_offset(user, user->za_offset); err |= preserve_za_context(za_ctx); } /* ZT state if present */ if (system_supports_sme2() && err == 0 && user->zt_offset) { struct zt_context __user *zt_ctx = apply_user_offset(user, user->zt_offset); err |= preserve_zt_context(zt_ctx); } if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp = apply_user_offset(user, user->extra_offset); struct extra_context __user *extra; struct _aarch64_ctx __user *end; u64 extra_datap; u32 extra_size; extra = (struct extra_context __user *)userp; userp += EXTRA_CONTEXT_SIZE; end = (struct _aarch64_ctx __user *)userp; userp += TERMINATOR_SIZE; /* * extra_datap is just written to the signal frame. * The value gets cast back to a void __user * * during sigreturn. */ extra_datap = (__force u64)userp; extra_size = sfp + round_up(user->size, 16) - userp; __put_user_error(EXTRA_MAGIC, &extra->head.magic, err); __put_user_error(EXTRA_CONTEXT_SIZE, &extra->head.size, err); __put_user_error(extra_datap, &extra->datap, err); __put_user_error(extra_size, &extra->size, err); /* Add the terminator */ __put_user_error(0, &end->magic, err); __put_user_error(0, &end->size, err); } /* set the "end" magic */ if (err == 0) { struct _aarch64_ctx __user *end = apply_user_offset(user, user->end_offset); __put_user_error(0, &end->magic, err); __put_user_error(0, &end->size, err); } return err; } static int get_sigframe(struct rt_sigframe_user_layout *user, struct ksignal *ksig, struct pt_regs *regs) { unsigned long sp, sp_top; int err; init_user_layout(user); err = setup_sigframe_layout(user, false); if (err) return err; sp = sp_top = sigsp(regs->sp, ksig); sp = round_down(sp - sizeof(struct frame_record), 16); user->next_frame = (struct frame_record __user *)sp; sp = round_down(sp, 16) - sigframe_size(user); user->sigframe = (struct rt_sigframe __user *)sp; /* * Check that we can actually write to the signal frame. */ if (!access_ok(user->sigframe, sp_top - sp)) return -EFAULT; return 0; } #ifdef CONFIG_ARM64_GCS static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { u64 gcspr_el0; int ret = 0; if (!system_supports_gcs()) return 0; if (!task_gcs_el0_enabled(current)) return 0; /* * We are entering a signal handler, current register state is * active. */ gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Push a cap and the GCS entry for the trampoline onto the GCS. */ put_user_gcs((unsigned long)sigtramp, (unsigned long __user *)(gcspr_el0 - 16), &ret); put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), (unsigned long __user *)(gcspr_el0 - 8), &ret); if (ret != 0) return ret; gcspr_el0 -= 16; write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); return 0; } #else static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { return 0; } #endif static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; int err; if (ksig->ka.sa.sa_flags & SA_RESTORER) sigtramp = ksig->ka.sa.sa_restorer; else sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); err = gcs_signal_entry(sigtramp, ksig); if (err) return err; /* * We must not fail from this point onwards. We are going to update * registers, including SP, in order to invoke the signal handler. If * we failed and attempted to deliver a nested SIGSEGV to a handler * after that point, the subsequent sigreturn would end up restoring * the (partial) state for the original signal handler. */ regs->regs[0] = usig; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { regs->regs[1] = (unsigned long)&user->sigframe->info; regs->regs[2] = (unsigned long)&user->sigframe->uc; } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->regs[30] = (unsigned long)sigtramp; regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in * userspace, so simulate the same setting of BTYPE as a BLR * <register containing the signal handler entry point>. * Signal delivery to a location in a PROT_BTI guarded page * that is not a function entry point will now trigger a * SIGILL in userspace. * * If the signal handler entry point is not in a PROT_BTI * guarded page, this is harmless. */ if (system_supports_bti()) { regs->pstate &= ~PSR_BTYPE_MASK; regs->pstate |= PSR_BTYPE_C; } /* TCO (Tag Check Override) always cleared for signal handlers */ regs->pstate &= ~PSR_TCO_BIT; /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { /* * If we were in streaming mode the saved register * state was SVE but we will exit SM and use the * FPSIMD register state - flush the saved FPSIMD * register state in case it gets loaded. */ if (current->thread.svcr & SVCR_SM_MASK) { memset(¤t->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); current->thread.fp_type = FP_STATE_FPSIMD; } current->thread.svcr &= ~(SVCR_ZA_MASK | SVCR_SM_MASK); sme_smstop(); } return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; struct user_access_state ua_state; int err = 0; fpsimd_signal_preserve_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set, &ua_state); if (ksig->ka.sa.sa_flags & SA_SIGINFO) err |= copy_siginfo_to_user(&frame->info, &ksig->info); if (err == 0) err = setup_return(regs, ksig, &user, usig); /* * We must not fail if setup_return() succeeded - see comment at the * beginning of setup_return(). */ if (err == 0) set_handler_user_access_state(); else restore_user_access_state(&ua_state); return err; } static void setup_restart_syscall(struct pt_regs *regs) { if (is_compat_task()) compat_setup_restart_syscall(regs); else regs->regs[8] = __NR_restart_syscall; } /* * OK, we're invoking a handler */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; rseq_signal_deliver(ksig, regs); /* * Set up the stack frame */ if (is_compat_task()) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) ret = compat_setup_rt_frame(usig, ksig, oldset, regs); else ret = compat_setup_frame(usig, ksig, oldset, regs); } else { ret = setup_rt_frame(usig, ksig, oldset, regs); } /* * Check that the resulting registers are actually sane. */ ret |= !valid_user_regs(®s->user_regs, current); /* Step into the signal handler if we are stepping */ signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. * * Note that we go through the signals twice: once to check the signals that * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; struct ksignal ksig; bool syscall = in_syscall(regs); /* * If we were from a system call, check for system call restarting... */ if (syscall) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; /* * Avoid additional syscall restarting via ret_to_user. */ forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a * debugger will see the already changed PC. */ switch (retval) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTART_RESTARTBLOCK: regs->regs[0] = regs->orig_x0; regs->pc = restart_addr; break; } } /* * Get the signal to deliver. When running under ptrace, at this point * the debugger may change all of our registers. */ if (get_signal(&ksig)) { /* * Depending on the signal settings, we may need to revert the * decision to restart the system call, but skip this if a * debugger has chosen to restart at a different PC. */ if (regs->pc == restart_addr && (retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } handle_signal(&ksig, regs); return; } /* * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ if (syscall && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); } restore_saved_sigmask(); } unsigned long __ro_after_init signal_minsigstksz; /* * Determine the stack space required for guaranteed signal devliery. * This function is used to populate AT_MINSIGSTKSZ at process startup. * cpufeatures setup is assumed to be complete. */ void __init minsigstksz_setup(void) { struct rt_sigframe_user_layout user; init_user_layout(&user); /* * If this fails, SIGFRAME_MAXSZ needs to be enlarged. It won't * be big enough, but it's our best guess: */ if (WARN_ON(setup_sigframe_layout(&user, true))) return; signal_minsigstksz = sigframe_size(&user) + round_up(sizeof(struct frame_record), 16) + 16; /* max alignment padding */ } /* * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as * changes likely come with new fields that should be added below. */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); static_assert(sizeof(siginfo_t) == 128); static_assert(__alignof__(siginfo_t) == 8); static_assert(offsetof(siginfo_t, si_signo) == 0x00); static_assert(offsetof(siginfo_t, si_errno) == 0x04); static_assert(offsetof(siginfo_t, si_code) == 0x08); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_tid) == 0x10); static_assert(offsetof(siginfo_t, si_overrun) == 0x14); static_assert(offsetof(siginfo_t, si_status) == 0x18); static_assert(offsetof(siginfo_t, si_utime) == 0x20); static_assert(offsetof(siginfo_t, si_stime) == 0x28); static_assert(offsetof(siginfo_t, si_value) == 0x18); static_assert(offsetof(siginfo_t, si_int) == 0x18); static_assert(offsetof(siginfo_t, si_ptr) == 0x18); static_assert(offsetof(siginfo_t, si_addr) == 0x10); static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); static_assert(offsetof(siginfo_t, si_lower) == 0x20); static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); static_assert(offsetof(siginfo_t, si_syscall) == 0x18); static_assert(offsetof(siginfo_t, si_arch) == 0x1c); |
265 261 261 261 261 261 261 261 3 3 265 265 3 111 261 223 235 235 265 265 235 235 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 | // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <linux/capability.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * The epnested_mutex is acquired when inserting an epoll fd onto another * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* * Protected by file->f_lock, true for to-be-released epitem already * removed from the "struct file" items list; together with * eventpoll->refcount orchestrates "struct eventpoll" disposal */ bool dying; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ rwlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; /* * usage count, used together with epitem->dying to * orchestrate the disposal of this struct */ refcount_t refcount; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; /* busy poll timeout */ u32 busy_poll_usecs; /* busy poll packet budget */ u16 busy_poll_budget; bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* Used for cycles detection */ static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, }; static void __init epoll_sysctls_init(void) { register_sysctl("fs/epoll", epoll_table); } #else #define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL /** * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value * from the epoll instance ep is preferred, but if it is not set fallback to * the system-wide global via busy_loop_timeout. * * @start_time: The start time used to compute the remaining time until timeout. * @ep: Pointer to the eventpoll context. * * Return: true if the timeout has expired, false otherwise. */ static bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep) { unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } else { return busy_loop_timeout(start_time); } } static bool ep_busy_loop_on(struct eventpoll *ep) { return !!READ_ONCE(ep->busy_poll_usecs) || READ_ONCE(ep->prefer_busy_poll) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (!budget) budget = BUSY_POLL_BUDGET; if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ if (prefer_busy_poll) napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct eventpoll *ep = file->private_data; void __user *uarg = (void __user *)arg; struct epoll_params epoll_params; switch (cmd) { case EPIOCSPARAMS: if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) return -EFAULT; /* pad byte must be zero */ if (epoll_params.__pad) return -EINVAL; if (epoll_params.busy_poll_usecs > S32_MAX) return -EINVAL; if (epoll_params.prefer_busy_poll > 1) return -EINVAL; if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && !capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); return 0; case EPIOCGPARAMS: memset(&epoll_params, 0, sizeof(epoll_params)); epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) return -EFAULT; return 0; default: return -ENOIOCTLCMD; } } static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } static void ep_suspend_napi_irqs(struct eventpoll *ep) { } static void ep_resume_napi_irqs(struct eventpoll *ep) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } write_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); } /* * Returns true if the event poll can be disposed */ static bool ep_refcount_dec_and_test(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); return true; } static void ep_free(struct eventpoll *ep) { ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. * If the dying flag is set, do the removal only if force is true. * This prevents ep_clear_and_put() from dropping all the ep references * while running concurrently with eventpoll_release_file(). * Returns true if the eventpoll can be disposed. */ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); if (epi->dying && !force) { spin_unlock(&file->f_lock); return false; } to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); } /* * ep_remove variant for callers owing an additional reference to the ep */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { WARN_ON_ONCE(__ep_remove(ep, epi, false)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree and try to free each "struct epitem". * Note that ep_remove_safe() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't * dispose it. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove_safe(ep, epi); cond_resched(); } dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); } static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; if (!is_file_epoll(file)) return -EINVAL; switch (cmd) { case EPIOCSPARAMS: case EPIOCGPARAMS: ret = ep_eventpoll_bp_ioctl(file, cmd, arg); break; default: ret = -EINVAL; break; } return ret; } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_clear_and_put(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * The ffd.file pointer may be in the process of being torn down due to * being closed, but we may not have finished eventpoll_release() yet. * * Normally, even with the atomic_long_inc_not_zero, the file may have * been free'd and then gotten re-allocated to something else (since * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). * * But for epoll, users hold the ep->mtx mutex, and as such any file in * the process of being free'd will block in eventpoll_release_file() * and thus the underlying file allocation will not be free'd, and the * file re-use cannot happen. * * For the same reason we can avoid a rcu_read_lock() around the * operation - 'ffd.file' cannot go away even if the refcount has * reached zero (but we must still not call out to ->poll() functions * etc). */ static struct file *epi_fget(const struct epitem *epi) { struct file *file; file = epi->ffd.file; if (!file_ref_get(&file->f_ref)) file = NULL; return file; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi_fget(epi); __poll_t res; /* * We could return EPOLLERR | EPOLLHUP or something, but let's * treat this more as "file doesn't exist, poll didn't happen". */ if (!file) return 0; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); fput(file); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, .unlocked_ioctl = ep_eventpoll_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from * touching the epitems list before eventpoll_release_file() can access * the ep->mtx. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); epi->dying = true; spin_unlock(&file->f_lock); /* * ep access is safe as we still own a reference to the ep * struct */ ep = epi->ep; mutex_lock(&ep->mtx); dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); goto again; } spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * * Beware: it is necessary to prevent any other modifications of the * existing list until all changes are completed, in other words * concurrent list_add_tail_lockless() calls should be protected * with a read lock, where write lock acts as a barrier which * makes sure all list_add_tail_lockless() calls are fully * completed. * * Also an element can be locklessly added to the list only in one * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, struct list_head *head) { struct list_head *prev; /* * This is simple 'new->next = head' operation, but cmpxchg() * is used in order to detect that same element has been just * added to the list from another CPU: the winner observes * new->next == new. */ if (!try_cmpxchg(&new->next, &new, head)) return false; /* * Initially ->next of a new element must be updated with the head * (we are inserting to the tail) and only then pointers are atomically * exchanged. XCHG guarantees memory ordering, thus ->next should be * updated before pointers are actually swapped and pointers are * swapped before prev->next is updated. */ prev = xchg(&head->prev, new); /* * It is safe to modify prev->next and new->prev, because a new element * is added only to the tail and new->next is updated before XCHG. */ prev->next = new; new->prev = prev; return true; } /* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; /* Fast preliminary check */ if (epi->next != EP_UNACTIVE_PTR) return false; /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; /* Atomically exchange tail */ epi->next = xchg(&ep->ovflist, epi); return true; } /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. * * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called * concurrently for the same @epi from different CPUs if poll table was inited * with several wait queues entries. Plural wakeup from different CPUs of a * single wait queue is serialized by wq.lock, but the case when multiple wait * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } if (sync) wake_up_sync(&ep->wq); else wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() * takes whead->lock held by the caller. Once we nullify it, * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* * ep_remove_safe() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove_safe(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove_safe(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } static int ep_try_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { int res; /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ res = ep_send_events(ep, events, maxevents); if (res > 0) ep_suspend_napi_irqs(ep); return res; } static int ep_schedule_timeout(ktime_t *to) { if (to) return ktime_after(*to, ktime_get()); else return 1; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { res = ep_try_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); if (!eavail && ep_schedule_timeout(to)) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: %zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; else error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); return error; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_clear_and_put(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; /* Get the "struct file *" for the target file */ CLASS(fd, tf)(fd); if (fd_empty(tf)) return -EBADF; /* The target file descriptor must support poll */ if (!file_can_poll(fd_file(tf))) return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(fd_file(tf))) { tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) { /* * The eventpoll itself is still alive: the refcount * can't go to zero here. */ ep_remove_safe(ep, epi); error = 0; } else { error = -ENOENT; } break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epnested_mutex); } return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } static int ep_check_params(struct file *file, struct epoll_event __user *evs, int maxevents) { /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ if (!is_file_epoll(file)) return -EINVAL; return 0; } int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents) { struct eventpoll *ep; int ret; ret = ep_check_params(file, events, maxevents); if (unlikely(ret)) return ret; ep = file->private_data; /* * Racy call, but that's ok - it should get retried based on * poll readiness anyway. */ if (ep_events_available(ep)) return ep_try_send_events(ep, events, maxevents); return 0; } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; ret = ep_check_params(fd_file(f), events, maxevents); if (unlikely(ret)) return ret; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* Time to fish for events ... */ return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init); |
8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_mutex must being held */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || i_uid_needs_update(idmap, ia, inode) || i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel. */ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) { /* Make sure someone else has active reference to dquot */ WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); atomic_inc(&dquot->dq_count); return dquot; } static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 0) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. */ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */ |
5 255 255 255 255 43 5 5 26 26 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */ |
246 247 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_PERCPU_INTERNAL_H #define _MM_PERCPU_INTERNAL_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/memcontrol.h> /* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. * * The scan hint is the largest known contiguous area before the contig hint. * It is not necessarily the actual largest contig hint though. There is an * invariant that the scan_hint_start > contig_hint_start iff * scan_hint == contig_hint. This is necessary because when scanning forward, * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { int scan_hint; /* scan hint for block */ int scan_hint_start; /* block relative starting position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ int left_free; /* size of free space along the left side of the block */ int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ int nr_bits; /* total bits responsible for */ }; struct pcpuobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref tag; #endif }; #if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ size_t max_alloc_size; /* largest allocation size */ #endif struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; unsigned long *bound_map; /* boundary map */ /* * base_addr is the base address of this chunk. * To reduce false sharing, current layout is optimized to make sure * base_addr locate in the different cacheline with free_bytes and * chunk_md. */ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; static inline bool need_pcpuobj_ext(void) { if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) return true; if (!mem_cgroup_kmem_disabled()) return true; return false; } extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. */ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 | // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> /* * To send multiple CAN frame content within TX_SETUP or to filter * CAN messages with multiplex index within RX_SETUP, the number of * different filters is limited to 256 due to the one byte index value. */ #define MAX_NFRAMES 256 /* limit timers to 400 days for sending/timeouts */ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ #define RX_LOCAL 0x10 /* frame was created on the local host */ #define RX_OWN 0x20 /* frame was sent via the socket it was received on */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ #define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS("can-proto-2"); #define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex) /* * easy access to the first 64 bit of can(fd)_frame payload. cp->data is * 64 bit aligned so the offset has to be multiples of 8 which is ensured * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). */ static inline u64 get_u64(const struct canfd_frame *cp, int offset) { return *(u64 *)(cp->data + offset); } struct bcm_op { struct list_head list; struct rcu_head rcu; int ifindex; canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; u32 count; u32 nframes; u32 currframe; /* void pointers to arrays of struct can[fd]_frame */ void *frames; void *last_frames; struct canfd_frame sframe; struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; }; struct bcm_sock { struct sock sk; int bound; int ifindex; struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; struct proc_dir_entry *bcm_proc_read; char procname [32]; /* inode number in decimal with \0 */ }; static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; /* Return pointer to store the extra msg flags for bcm_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. */ static inline unsigned int *bcm_flags(struct sk_buff *skb) { /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); } static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; } static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) { return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } /* check limitations for timeval provided by user */ static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head) { if ((msg_head->ival1.tv_sec < 0) || (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival1.tv_usec < 0) || (msg_head->ival1.tv_usec >= USEC_PER_SEC) || (msg_head->ival2.tv_sec < 0) || (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival2.tv_usec < 0) || (msg_head->ival2.tv_usec >= USEC_PER_SEC)) return true; return false; } #define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) /* * procfs functions */ #if IS_ENABLED(CONFIG_PROC_FS) static char *bcm_proc_getifname(struct net *net, char *result, int ifindex) { struct net_device *dev; if (!ifindex) return "any"; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) strcpy(result, dev->name); else strcpy(result, "???"); rcu_read_unlock(); return result; } static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; seq_printf(m, ">>> socket %pK", sk->sk_socket); seq_printf(m, " / sk %pK", sk); seq_printf(m, " / bo %pK", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); list_for_each_entry(op, &bo->rx_ops, list) { unsigned long reduction; /* print only active entries & prevent division by zero */ if (!op->frames_abs) continue; seq_printf(m, "rx_op: %03X %-5s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u)", op->nframes); else seq_printf(m, "[%u]", op->nframes); seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); if (op->kt_ival1) seq_printf(m, "timeo=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "thr=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u) ", op->nframes); else seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1) seq_printf(m, "t1=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "t2=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); return 0; } #endif /* CONFIG_PROC_FS */ /* * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface * of the given bcm tx op */ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; int err; /* no target device? => exit */ if (!op->ifindex) return; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ return; } skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_put_data(skb, cf, op->cfsiz); /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); if (!err) op->frames_abs++; op->currframe++; /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; out: dev_put(dev); } /* * bcm_send_to_user - send a BCM message to the userspace * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; unsigned int *pflags; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) return; skb_put_data(skb, head, sizeof(*head)); /* ensure space for sockaddr_can and msg flags */ sock_skb_cb_check_size(sizeof(struct sockaddr_can) + sizeof(unsigned int)); /* initialize msg flags */ pflags = bcm_flags(skb); *pflags = 0; if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); skb_put_data(skb, frames, datalen); /* * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) { if (firstframe->flags & RX_LOCAL) *pflags |= MSG_DONTROUTE; if (firstframe->flags & RX_OWN) *pflags |= MSG_CONFIRM; firstframe->flags &= BCM_CAN_FLAGS_MASK; } } if (has_timestamp) { /* restore rx timestamp */ skb->tstamp = op->rx_stamp; } /* * Put the datagram to the queue so that bcm_recvmsg() can * get it from there. We need to pass the interface index to * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb * containing the interface index. */ addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = op->rx_ifindex; err = sock_queue_rcv_skb(sk, skb); if (err < 0) { struct bcm_sock *bo = bcm_sk(sk); kfree_skb(skb); /* don't care about overflows in this statistic */ bo->dropped_usr_msgs++; } } static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { ktime_t ival; if (op->kt_ival1 && op->count) ival = op->kt_ival1; else if (op->kt_ival2) ival = op->kt_ival2; else return false; hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); return true; } static void bcm_tx_start_timer(struct bcm_op *op) { if (bcm_tx_set_expiry(op, &op->timer)) hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); } /* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); } bcm_can_tx(op); } else if (op->kt_ival2) { bcm_can_tx(op); } return bcm_tx_set_expiry(op, &op->timer) ? HRTIMER_RESTART : HRTIMER_NORESTART; } /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; /* update statistics */ op->frames_filtered++; /* prevent statistics overflow */ if (op->frames_filtered > ULONG_MAX/100) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ data->flags &= ~RX_THR; memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; head.ival1 = op->ival1; head.ival2 = op->ival2; head.can_id = op->can_id; head.nframes = 1; bcm_send_to_user(op, &head, data, 1); } /* * bcm_rx_update_and_send - process a detected relevant receive content change * 1. update the last received data * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, const struct canfd_frame *rxdata, unsigned char traffic_flags) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); /* add own/local/remote traffic flags */ lastdata->flags |= traffic_flags; /* throttling mode inactive ? */ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ bcm_rx_changed(op, lastdata); return; } /* with active throttling timer we are just done here */ if (hrtimer_active(&op->thrtimer)) return; /* first reception with enabled throttling mode */ if (!op->kt_lastmsg) goto rx_changed_settime; /* got a second frame inside a potential throttle period? */ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) < ktime_to_us(op->kt_ival2)) { /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), HRTIMER_MODE_ABS_SOFT); return; } /* the gap was that big, that throttling was not needed here */ rx_changed_settime: bcm_rx_changed(op, lastdata); op->kt_lastmsg = ktime_get(); } /* * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, const struct canfd_frame *rxdata, unsigned char traffic_flags) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; int i; /* * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } /* do a real check in CAN frame data section */ for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } } /* * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception */ static void bcm_rx_starttimer(struct bcm_op *op) { if (op->flags & RX_NO_AUTOTIMER) return; if (op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { /* clear received CAN frames to indicate 'nothing received' */ memset(op->last_frames, 0, op->nframes * op->cfsiz); } /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); return HRTIMER_NORESTART; } /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { bcm_rx_changed(op, lcf); return 1; } return 0; } /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace */ static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; if (op->nframes > 1) { unsigned int i; /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ updated += bcm_rx_do_flush(op, 0); } return updated; } /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace */ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); if (bcm_rx_thr_flush(op)) { hrtimer_forward_now(hrtimer, op->kt_ival2); return HRTIMER_RESTART; } else { /* rearm throttle handling */ op->kt_lastmsg = 0; return HRTIMER_NORESTART; } } /* * bcm_rx_handler - handle a CAN frame reception */ static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; unsigned char traffic_flags; if (op->can_id != rxframe->can_id) return; /* make sure to handle the correct frame type (CAN / CAN FD) */ if (op->flags & CAN_FD_FRAME) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* disable timeout */ hrtimer_cancel(&op->timer); /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ op->rx_ifindex = skb->dev->ifindex; /* update statistics */ op->frames_abs++; if (op->flags & RX_RTR_FRAME) { /* send reply for RTR-request (placed in op->frames[0]) */ bcm_can_tx(op); return; } /* compute flags to distinguish between own/local/remote CAN traffic */ traffic_flags = 0; if (skb->sk) { traffic_flags |= RX_LOCAL; if (skb->sk == op->sk) traffic_flags |= RX_OWN; } if (op->flags & RX_FILTER_ID) { /* the easiest case */ bcm_rx_update_and_send(op, op->last_frames, rxframe, traffic_flags); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags); goto rx_starttimer; } if (op->nframes > 1) { /* * multiplex compare * * find the first multiplex mask that fits. * Remark: The MUX-mask is stored in index 0 - but only the * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe, traffic_flags); break; } } } rx_starttimer: bcm_rx_starttimer(op); } /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ static struct bcm_op *bcm_find_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } return NULL; } static void bcm_free_op_rcu(struct rcu_head *rcu_head) { struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); if ((op->last_frames) && (op->last_frames != &op->last_sframe)) kfree(op->last_frames); kfree(op); } static void bcm_remove_op(struct bcm_op *op) { hrtimer_cancel(&op->timer); hrtimer_cancel(&op->thrtimer); call_rcu(&op->rcu, bcm_free_op_rcu); } static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { can_rx_unregister(dev_net(dev), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; } else printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " "mismatch %p %p\n", op->rx_reg_dev, dev); } /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* disable automatic timer on frame reception */ op->flags |= RX_NO_AUTOTIMER; /* * Don't care if we're bound or not (due to netdev * problems) can_rx_unregister() is always a save * thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(sock_net(op->sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) */ static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; /* put current values into msg_head */ msg_head->flags = op->flags; msg_head->count = op->count; msg_head->ival1 = op->ival1; msg_head->ival2 = op->ival2; msg_head->nframes = op->nframes; bcm_send_to_user(op, msg_head, op->frames, 0); return MHSIZ; } /* * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) */ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; struct canfd_frame *cf; unsigned int i; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } op->flags = msg_head->flags; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } } else op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (err < 0) goto free_op; if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) goto free_op; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } /* tx_ops never compare with previous received messages */ op->last_frames = NULL; /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* currently unused in tx_ops */ hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ if (op->nframes != msg_head->nframes) { op->nframes = msg_head->nframes; /* start multiple frame transmission with index 0 */ op->currframe = 0; } /* check flags */ if (op->flags & TX_RESET_MULTI_IDX) { /* start multiple frame transmission with index 0 */ op->currframe = 0; } if (op->flags & SETTIMER) { /* set timer values */ op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? */ if (!op->kt_ival1 && !op->kt_ival2) hrtimer_cancel(&op->timer); } if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } if (op->flags & TX_ANNOUNCE) { bcm_can_tx(op); if (op->count) op->count--; } if (op->flags & STARTTIMER) bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; free_op: if (op->frames != &op->sframe) kfree(op->frames); kfree(op); return err; } /* * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) */ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; int do_rx_register; int err = 0; if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { /* be robust against wrong usage ... */ msg_head->flags |= RX_FILTER_ID; /* ignore trailing garbage */ msg_head->nframes = 0; } /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ if (msg_head->nframes > MAX_NFRAMES + 1) return -EINVAL; if ((msg_head->flags & RX_RTR_FRAME) && ((msg_head->nframes != 1) || (!(msg_head->can_id & CAN_RTR_FLAG)))) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; if (msg_head->nframes) { /* update CAN frames content */ err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } /* create and init array for received CAN frames */ op->last_frames = kcalloc(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); kfree(op); return -ENOMEM; } } else { op->frames = &op->sframe; op->last_frames = &op->last_sframe; } if (msg_head->nframes) { err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); if (op->last_frames != &op->last_sframe) kfree(op->last_frames); kfree(op); return err; } } /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* ifindex for timeout events w/o previous frame reception */ op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); /* call can_rx_register() */ do_rx_register = 1; } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ if (op->flags & RX_RTR_FRAME) { struct canfd_frame *frame0 = op->frames; /* no timers in RTR-mode */ hrtimer_cancel(&op->thrtimer); hrtimer_cancel(&op->timer); /* * funny feature in RX(!)_SETUP only for RTR-mode: * copy can_id into frame BUT without RTR-flag to * prevent a full-load-loopback-test ... ;-] */ if ((op->flags & TX_CP_CAN_ID) || (frame0->can_id == op->can_id)) frame0->can_id = op->can_id & ~CAN_RTR_FLAG; } else { if (op->flags & SETTIMER) { /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? */ if (!op->kt_ival1) hrtimer_cancel(&op->timer); /* * In any case cancel the throttle timer, flush * potentially blocked msgs and reset throttle handling */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ if (do_rx_register) { if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (dev) { err = can_rx_register(sock_net(sk), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); } } else err = can_rx_register(sock_net(sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del(&op->list); bcm_remove_op(op); return err; } } return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; struct net_device *dev; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) { kfree_skb(skb); return -ENODEV; } can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); if (err) return err; return cfsiz + MHSIZ; } /* * bcm_sendmsg - process BCM commands (opcodes) from the userspace */ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) return -ENOTCONN; /* check for valid message length from userspace */ if (size < MHSIZ) return -EINVAL; /* read message head information */ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); if (ret < 0) return ret; cfsiz = CFSIZ(msg_head.flags); if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ if (!ifindex && msg->msg_name) { /* no bound device as default => check msg_name */ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < BCM_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* ifindex from sendto() */ ifindex = addr->can_ifindex; if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENODEV; if (dev->type != ARPHRD_CAN) { dev_put(dev); return -ENODEV; } dev_put(dev); } } lock_sock(sk); switch (msg_head.opcode) { case TX_SETUP: ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); break; case RX_SETUP: ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); break; case TX_DELETE: if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case TX_READ: /* reuse msg_head for the reply to TX_READ */ msg_head.opcode = TX_STATUS; ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex); break; case RX_READ: /* reuse msg_head for the reply to RX_READ */ msg_head.opcode = RX_STATUS; ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); break; case TX_SEND: /* we need exactly one CAN frame behind the msg head */ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: ret = -EINVAL; break; } release_sock(sk); return ret; } /* * notification handler for netdevice status changes */ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, struct net_device *dev) { struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove device specific receive entries */ list_for_each_entry(op, &bo->rx_ops, list) if (op->rx_reg_dev == dev) bcm_rx_unreg(dev, op); /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { #if IS_ENABLED(CONFIG_PROC_FS) if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) { remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir); bo->bcm_proc_read = NULL; } #endif bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; } release_sock(sk); if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } break; case NETDEV_DOWN: if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } } } static int bcm_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&bcm_notifier_lock); list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { spin_unlock(&bcm_notifier_lock); bcm_notify(bcm_busy_notifier, msg, dev); spin_lock(&bcm_notifier_lock); } bcm_busy_notifier = NULL; spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } /* * initial settings for all BCM sockets to be set at socket creation time */ static int bcm_init(struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); bo->bound = 0; bo->ifindex = 0; bo->dropped_usr_msgs = 0; bo->bcm_proc_read = NULL; INIT_LIST_HEAD(&bo->tx_ops); INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ spin_lock(&bcm_notifier_lock); list_add_tail(&bo->notifier, &bcm_notifier_list); spin_unlock(&bcm_notifier_lock); return 0; } /* * standard socket functions */ static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct net *net; struct bcm_sock *bo; struct bcm_op *op, *next; if (!sk) return 0; net = sock_net(sk); bo = bcm_sk(sk); /* remove bcm_ops, timer, rx_unregister(), etc. */ spin_lock(&bcm_notifier_lock); while (bcm_busy_notifier == bo) { spin_unlock(&bcm_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&bcm_notifier_lock); } list_del(&bo->notifier); spin_unlock(&bcm_notifier_lock); lock_sock(sk); #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) remove_proc_entry(bo->procname, net->can.bcmproc_dir); #endif /* CONFIG_PROC_FS */ list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); list_for_each_entry_safe(op, next, &bo->rx_ops, list) { /* * Don't care if we're bound or not (due to netdev problems) * can_rx_unregister() is always a save thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(net, op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); } synchronize_rcu(); list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); /* remove device reference */ if (bo->bound) { bo->bound = 0; bo->ifindex = 0; } sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; } static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); struct net *net = sock_net(sk); int ret = 0; if (len < BCM_MIN_NAMELEN) return -EINVAL; lock_sock(sk); if (bo->bound) { ret = -EISCONN; goto fail; } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { ret = -ENODEV; goto fail; } if (dev->type != ARPHRD_CAN) { dev_put(dev); ret = -ENODEV; goto fail; } bo->ifindex = dev->ifindex; dev_put(dev); } else { /* no interface reference for ifindex = 0 ('any' CAN device) */ bo->ifindex = 0; } #if IS_ENABLED(CONFIG_PROC_FS) if (net->can.bcmproc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644, net->can.bcmproc_dir, bcm_proc_show, sk); if (!bo->bcm_proc_read) { ret = -ENOMEM; goto fail; } } #endif /* CONFIG_PROC_FS */ bo->bound = 1; fail: release_sock(sk); return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; int err; skb = skb_recv_datagram(sk, flags, &error); if (!skb) return error; if (skb->len < size) size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(BCM_MIN_NAMELEN); msg->msg_namelen = BCM_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* assign the flags that have been recorded in bcm_send_to_user() */ msg->msg_flags |= *(bcm_flags(skb)); skb_free_datagram(sk, skb); return size; } static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, .connect = bcm_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = bcm_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, }; static struct proto bcm_proto __read_mostly = { .name = "CAN_BCM", .owner = THIS_MODULE, .obj_size = sizeof(struct bcm_sock), .init = bcm_init, }; static const struct can_proto bcm_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_BCM, .ops = &bcm_ops, .prot = &bcm_proto, }; static int canbcm_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* create /proc/net/can-bcm directory */ net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ return 0; } static void canbcm_pernet_exit(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* remove /proc/net/can-bcm directory */ if (net->can.bcmproc_dir) remove_proc_entry("can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ } static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, }; static struct notifier_block canbcm_notifier = { .notifier_call = bcm_notifier }; static int __init bcm_module_init(void) { int err; pr_info("can: broadcast manager protocol\n"); err = register_pernet_subsys(&canbcm_pernet_ops); if (err) return err; err = register_netdevice_notifier(&canbcm_notifier); if (err) goto register_notifier_failed; err = can_proto_register(&bcm_can_proto); if (err < 0) { printk(KERN_ERR "can: registration of bcm protocol failed\n"); goto register_proto_failed; } return 0; register_proto_failed: unregister_netdevice_notifier(&canbcm_notifier); register_notifier_failed: unregister_pernet_subsys(&canbcm_pernet_ops); return err; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } module_init(bcm_module_init); module_exit(bcm_module_exit); |
1493 1494 220 220 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | // SPDX-License-Identifier: GPL-2.0 #include <linux/bug.h> #include <linux/export.h> #include <linux/types.h> #include <linux/mmdebug.h> #include <linux/mm.h> #include <asm/memory.h> phys_addr_t __virt_to_phys(unsigned long x) { WARN(!__is_lm_address(__tag_reset(x)), "virt_to_phys used for non-linear address: %p (%pS)\n", (void *)x, (void *)x); return __virt_to_phys_nodebug(x); } EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { /* * This is bounds checking against the kernel image only. * __pa_symbol should only be used on kernel symbol addresses. */ VIRTUAL_BUG_ON(x < (unsigned long) KERNEL_START || x > (unsigned long) KERNEL_END); return __pa_symbol_nodebug(x); } EXPORT_SYMBOL(__phys_addr_symbol); |
22 21 22 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | // SPDX-License-Identifier: GPL-2.0-only /* * Lock-less NULL terminated single linked list * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/llist.h> /** * llist_add_batch - add several linked entries in batch * @new_first: first entry in batch to be added * @new_last: last entry in batch to be added * @head: the head for your lock-less list * * Return whether list is empty before adding. */ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head) { struct llist_node *first = READ_ONCE(head->first); do { new_last->next = first; } while (!try_cmpxchg(&head->first, &first, new_first)); return !first; } EXPORT_SYMBOL_GPL(llist_add_batch); /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list * * If list is empty, return NULL, otherwise, return the first entry * deleted, this is the newest added one. * * Only one llist_del_first user can be used simultaneously with * multiple llist_add users without lock. Because otherwise * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, * llist_add) sequence in another user may change @head->first->next, * but keep @head->first. If multiple consumers are needed, please * use llist_del_all or use lock between consumers. */ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); do { if (entry == NULL) return NULL; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } EXPORT_SYMBOL_GPL(llist_del_first); /** * llist_del_first_this - delete given entry of lock-less list if it is first * @head: the head for your lock-less list * @this: a list entry. * * If head of the list is given entry, delete and return %true else * return %false. * * Multiple callers can safely call this concurrently with multiple * llist_add() callers, providing all the callers offer a different @this. */ bool llist_del_first_this(struct llist_head *head, struct llist_node *this) { struct llist_node *entry, *next; /* acquire ensures orderig wrt try_cmpxchg() is llist_del_first() */ entry = smp_load_acquire(&head->first); do { if (entry != this) return false; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return true; } EXPORT_SYMBOL_GPL(llist_del_first_this); /** * llist_reverse_order - reverse order of a llist chain * @head: first item of the list to be reversed * * Reverse the order of a chain of llist entries and return the * new first entry. */ struct llist_node *llist_reverse_order(struct llist_node *head) { struct llist_node *new_head = NULL; while (head) { struct llist_node *tmp = head; head = head->next; tmp->next = new_head; new_head = tmp; } return new_head; } EXPORT_SYMBOL_GPL(llist_reverse_order); |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 | // SPDX-License-Identifier: GPL-2.0-or-later /* * "TEE" target extension for Xtables * Copyright © Sebastian Claßen, 2007 * Jan Engelhardt, 2007-2010 * * based on ipt_ROUTE.c from Cédric de Launois * <delaunois@info.ucl.be> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/route.h> #include <linux/netfilter/x_tables.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #include <net/netfilter/ipv6/nf_dup_ipv6.h> #include <linux/netfilter/xt_TEE.h> struct xt_tee_priv { struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; static unsigned int tee_net_id __read_mostly; static const union nf_inet_addr tee_zero_address; struct tee_net { struct list_head priv_list; /* lock protects the priv_list */ struct mutex lock; }; static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif); return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); module_exit(tee_tg_exit); MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Reroute packet copy"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TEE"); MODULE_ALIAS("ip6t_TEE"); |
935 936 960 960 960 960 935 948 947 946 945 947 89 925 928 926 926 927 76 961 962 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 | // SPDX-License-Identifier: GPL-2.0 /* * Lockless hierarchical page accounting & limiting * * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner */ #include <linux/page_counter.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/bug.h> #include <asm/page.h> static bool track_protection(struct page_counter *c) { return c->protection_support; } static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; long delta; if (!c->parent) return; protected = min(usage, READ_ONCE(c->min)); old_protected = atomic_long_read(&c->min_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } protected = min(usage, READ_ONCE(c->low)); old_protected = atomic_long_read(&c->low_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_low_usage); } } /** * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel */ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? */ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) { new = 0; atomic_long_set(&counter->usage, new); } if (track_protection(counter)) propagate_protected_usage(counter, new); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); if (protection) propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. * * Notably, we have two watermarks to allow for both a globally * visible peak and one that can be reset at a smaller scope. * * Since we reset both watermarks when the global reset occurs, * we can guarantee that watermark >= local_watermark, so we * don't need to do both comparisons every time. * * On systems with branch predictors, the inner condition should * be almost free. */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; bool protection = track_protection(counter); bool track_failcnt = counter->track_failcnt; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. */ if (track_failcnt) data_race(c->failcnt++); *fail = c; goto failed; } if (protection) propagate_protected_usage(c, new); /* see comment on page_counter_charge */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; } #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its * parent's and siblings' settings, as well as the actual memory * distribution in the tree. * * The following rules apply to the effective protection values: * * 1. At the first level of reclaim, effective protection is equal to * the declared protection in memory.min and memory.low. * * 2. To enable safe delegation of the protection configuration, at * subsequent levels the effective protection is capped to the * parent's effective protection. * * 3. To make complex and dynamic subtrees easier to configure, the * user is allowed to overcommit the declared protection at a given * level. If that is the case, the parent's effective protection is * distributed to the children in proportion to how much protection * they have declared and how much of it they are utilizing. * * This makes distribution proportional, but also work-conserving: * if one counter claims much more protection than it uses memory, * the unused remainder is available to its siblings. * * 4. Conversely, when the declared protection is undercommitted at a * given level, the distribution of the larger parental protection * budget is NOT proportional. A counter's protection from a sibling * is capped to its own memory.min/low setting. * * 5. However, to allow protecting recursive subtrees from each other * without having to declare each individual counter's fixed share * of the ancestor's claim to protection, any unutilized - * "floating" - protection from up the tree is distributed in * proportion to each counter's *usage*. This makes the protection * neutral wrt sibling cgroups and lets them compete freely over * the shared parental protection budget, but it protects the * subtree as a whole from neighboring subtrees. * * Note that 4. and 5. are not in conflict: 4. is about protecting * against immediate siblings whereas 5. is about protecting against * neighboring subtrees. */ static unsigned long effective_protection(unsigned long usage, unsigned long parent_usage, unsigned long setting, unsigned long parent_effective, unsigned long siblings_protected, bool recursive_protection) { unsigned long protected; unsigned long ep; protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically * claimed protection in order to be work-conserving: claimed * but unused protection is available to siblings that would * otherwise get a smaller chunk than what they claimed. */ if (siblings_protected > parent_effective) return protected * parent_effective / siblings_protected; /* * Ok, utilized protection of all children is within what the * parent affords them, so we know whatever this child claims * and utilizes is effectively protected. * * If there is unprotected usage beyond this value, reclaim * will apply pressure in proportion to that amount. * * If there is unutilized protection, the cgroup will be fully * shielded from reclaim, but we do return a smaller value for * protection than what the group could enjoy in theory. This * is okay. With the overcommit distribution above, effective * protection is always dependent on how memory is actually * consumed among the siblings anyway. */ ep = protected; /* * If the children aren't claiming (all of) the protection * afforded to them by the parent, distribute the remainder in * proportion to the (unprotected) memory of each cgroup. That * way, cgroups that aren't explicitly prioritized wrt each * other compete freely over the allowance, but they are * collectively protected from neighboring trees. * * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. * * Check both usage and parent_usage against the respective * protected values. One should imply the other, but they * aren't read atomically - make sure the division is sane. */ if (!recursive_protection) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep; } /** * page_counter_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @counter: the page_counter the counter to update * @recursive_protection: Whether to use memory_recursiveprot behavior. * * Calculates elow/emin thresholds for given page_counter. * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. */ void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection) { unsigned long usage, parent_usage; struct page_counter *parent = counter->parent; /* * Effective values of the reclaim targets are ignored so they * can be stale. Have a look at mem_cgroup_protection for more * details. * TODO: calculation should be more robust so that we do not need * that special casing. */ if (root == counter) return; usage = page_counter_read(counter); if (!usage) return; if (parent == root) { counter->emin = READ_ONCE(counter->min); counter->elow = READ_ONCE(counter->low); return; } parent_usage = page_counter_read(parent); WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage, READ_ONCE(counter->min), READ_ONCE(parent->emin), atomic_long_read(&parent->children_min_usage), recursive_protection)); WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage, READ_ONCE(counter->low), READ_ONCE(parent->elow), atomic_long_read(&parent->children_low_usage), recursive_protection)); } #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */ |
780 969 116 961 262 209 145 1396 251 1259 681 585 476 275 5 95 142 22 186 1306 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 | // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-long.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_LONG_H #define _LINUX_ATOMIC_LONG_H #include <linux/compiler.h> #include <asm/types.h> #ifdef CONFIG_64BIT typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define atomic_long_cond_read_acquire atomic64_cond_read_acquire #define atomic_long_cond_read_relaxed atomic64_cond_read_relaxed #else typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define atomic_long_cond_read_acquire atomic_cond_read_acquire #define atomic_long_cond_read_relaxed atomic_cond_read_relaxed #endif /** * raw_atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read(v); #else return raw_atomic_read(v); #endif } /** * raw_atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read_acquire(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read_acquire(v); #else return raw_atomic_read_acquire(v); #endif } /** * raw_atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set(v, i); #else raw_atomic_set(v, i); #endif } /** * raw_atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set_release(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set_release(v, i); #else raw_atomic_set_release(v, i); #endif } /** * raw_atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_add(i, v); #else raw_atomic_add(i, v); #endif } /** * raw_atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return(i, v); #else return raw_atomic_add_return(i, v); #endif } /** * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_acquire(i, v); #else return raw_atomic_add_return_acquire(i, v); #endif } /** * raw_atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_release(i, v); #else return raw_atomic_add_return_release(i, v); #endif } /** * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_relaxed(i, v); #else return raw_atomic_add_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add(i, v); #else return raw_atomic_fetch_add(i, v); #endif } /** * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_acquire(i, v); #else return raw_atomic_fetch_add_acquire(i, v); #endif } /** * raw_atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_release(i, v); #else return raw_atomic_fetch_add_release(i, v); #endif } /** * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_relaxed(i, v); #else return raw_atomic_fetch_add_relaxed(i, v); #endif } /** * raw_atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_sub(i, v); #else raw_atomic_sub(i, v); #endif } /** * raw_atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return(i, v); #else return raw_atomic_sub_return(i, v); #endif } /** * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_acquire(i, v); #else return raw_atomic_sub_return_acquire(i, v); #endif } /** * raw_atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_release(i, v); #else return raw_atomic_sub_return_release(i, v); #endif } /** * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_relaxed(i, v); #else return raw_atomic_sub_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub(i, v); #else return raw_atomic_fetch_sub(i, v); #endif } /** * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_acquire(i, v); #else return raw_atomic_fetch_sub_acquire(i, v); #endif } /** * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_release(i, v); #else return raw_atomic_fetch_sub_release(i, v); #endif } /** * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_relaxed(i, v); #else return raw_atomic_fetch_sub_relaxed(i, v); #endif } /** * raw_atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_inc(v); #else raw_atomic_inc(v); #endif } /** * raw_atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return(v); #else return raw_atomic_inc_return(v); #endif } /** * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_acquire(v); #else return raw_atomic_inc_return_acquire(v); #endif } /** * raw_atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_release(v); #else return raw_atomic_inc_return_release(v); #endif } /** * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_relaxed(v); #else return raw_atomic_inc_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc(v); #else return raw_atomic_fetch_inc(v); #endif } /** * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_acquire(v); #else return raw_atomic_fetch_inc_acquire(v); #endif } /** * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_release(v); #else return raw_atomic_fetch_inc_release(v); #endif } /** * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_relaxed(v); #else return raw_atomic_fetch_inc_relaxed(v); #endif } /** * raw_atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_dec(v); #else raw_atomic_dec(v); #endif } /** * raw_atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return(v); #else return raw_atomic_dec_return(v); #endif } /** * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_acquire(v); #else return raw_atomic_dec_return_acquire(v); #endif } /** * raw_atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_release(v); #else return raw_atomic_dec_return_release(v); #endif } /** * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_relaxed(v); #else return raw_atomic_dec_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec(v); #else return raw_atomic_fetch_dec(v); #endif } /** * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_acquire(v); #else return raw_atomic_fetch_dec_acquire(v); #endif } /** * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_release(v); #else return raw_atomic_fetch_dec_release(v); #endif } /** * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_relaxed(v); #else return raw_atomic_fetch_dec_relaxed(v); #endif } /** * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_and(i, v); #else raw_atomic_and(i, v); #endif } /** * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and(i, v); #else return raw_atomic_fetch_and(i, v); #endif } /** * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_acquire(i, v); #else return raw_atomic_fetch_and_acquire(i, v); #endif } /** * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_release(i, v); #else return raw_atomic_fetch_and_release(i, v); #endif } /** * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_relaxed(i, v); #else return raw_atomic_fetch_and_relaxed(i, v); #endif } /** * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_andnot(i, v); #else raw_atomic_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot(i, v); #else return raw_atomic_fetch_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_acquire(i, v); #else return raw_atomic_fetch_andnot_acquire(i, v); #endif } /** * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_release(i, v); #else return raw_atomic_fetch_andnot_release(i, v); #endif } /** * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_relaxed(i, v); #else return raw_atomic_fetch_andnot_relaxed(i, v); #endif } /** * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_or(i, v); #else raw_atomic_or(i, v); #endif } /** * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or(i, v); #else return raw_atomic_fetch_or(i, v); #endif } /** * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_acquire(i, v); #else return raw_atomic_fetch_or_acquire(i, v); #endif } /** * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_release(i, v); #else return raw_atomic_fetch_or_release(i, v); #endif } /** * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_relaxed(i, v); #else return raw_atomic_fetch_or_relaxed(i, v); #endif } /** * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_xor(i, v); #else raw_atomic_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor(i, v); #else return raw_atomic_fetch_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_acquire(i, v); #else return raw_atomic_fetch_xor_acquire(i, v); #endif } /** * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_release(i, v); #else return raw_atomic_fetch_xor_release(i, v); #endif } /** * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_relaxed(i, v); #else return raw_atomic_fetch_xor_relaxed(i, v); #endif } /** * raw_atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg(v, new); #else return raw_atomic_xchg(v, new); #endif } /** * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_acquire(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_acquire(v, new); #else return raw_atomic_xchg_acquire(v, new); #endif } /** * raw_atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_release(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_release(v, new); #else return raw_atomic_xchg_release(v, new); #endif } /** * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_relaxed(v, new); #else return raw_atomic_xchg_relaxed(v, new); #endif } /** * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg(v, old, new); #else return raw_atomic_cmpxchg(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_acquire(v, old, new); #else return raw_atomic_cmpxchg_acquire(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_release(v, old, new); #else return raw_atomic_cmpxchg_release(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_relaxed(v, old, new); #else return raw_atomic_cmpxchg_relaxed(v, old, new); #endif } /** * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_release(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new); #endif } /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_sub_and_test(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_and_test(i, v); #else return raw_atomic_sub_and_test(i, v); #endif } /** * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_and_test(v); #else return raw_atomic_dec_and_test(v); #endif } /** * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_and_test(v); #else return raw_atomic_inc_and_test(v); #endif } /** * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative(i, v); #else return raw_atomic_add_negative(i, v); #endif } /** * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_acquire(i, v); #else return raw_atomic_add_negative_acquire(i, v); #endif } /** * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_release(i, v); #else return raw_atomic_add_negative_release(i, v); #endif } /** * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_relaxed(i, v); #else return raw_atomic_add_negative_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u); #endif } /** * raw_atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_add_unless(v, a, u); #else return raw_atomic_add_unless(v, a, u); #endif } /** * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_not_zero(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_not_zero(v); #else return raw_atomic_inc_not_zero(v); #endif } /** * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_unless_negative(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_unless_negative(v); #else return raw_atomic_inc_unless_negative(v); #endif } /** * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_unless_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_unless_positive(v); #else return raw_atomic_dec_unless_positive(v); #endif } /** * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long raw_atomic_long_dec_if_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_if_positive(v); #else return raw_atomic_dec_if_positive(v); #endif } #endif /* _LINUX_ATOMIC_LONG_H */ // eadf183c3600b8b92b91839dd3be6bcc560c752d |
1278 1278 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | // SPDX-License-Identifier: GPL-2.0 /* * Generic sched_clock() support, to extend low level hardware time * counters to full 64-bit ns values. */ #include <linux/clocksource.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/kernel.h> #include <linux/math.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/hrtimer.h> #include <linux/sched_clock.h> #include <linux/seqlock.h> #include <linux/bitops.h> #include "timekeeping.h" /** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) * * @seq: Sequence counter for protecting updates. The lowest * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. * @wrap_kt: Duration for which clock can run before wrapping. * @rate: Tick rate of the registered clock. * @actual_read_sched_clock: Registered hardware level clock read function. * * The ordering of this structure has been chosen to optimize cache * performance. In particular 'seq' and 'read_data[0]' (combined) should fit * into a single 64-byte cache line. */ struct clock_data { seqcount_latch_t seq; struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); }; static struct hrtimer sched_clock_timer; static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); static u64 notrace jiffy_sched_clock_read(void) { /* * We don't need to use get_jiffies_64 on 32-bit arches here * because we register with BITS_PER_LONG */ return (u64)(jiffies - INITIAL_JIFFIES); } static struct clock_data cd ____cacheline_aligned = { .read_data[0] = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, }; static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { *seq = read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } notrace int sched_clock_read_retry(unsigned int seq) { return read_seqcount_latch_retry(&cd.seq, seq); } static __always_inline unsigned long long __sched_clock(void) { struct clock_read_data *rd; unsigned int seq; u64 cyc, res; do { seq = raw_read_seqcount_latch(&cd.seq); rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); return res; } unsigned long long noinstr sched_clock_noinstr(void) { return __sched_clock(); } unsigned long long notrace sched_clock(void) { unsigned long long ns; preempt_disable_notrace(); /* * All of __sched_clock() is a seqcount_latch reader critical section, * but relies on the raw helpers which are uninstrumented. For KCSAN, * mark all accesses in __sched_clock() as atomic. */ kcsan_nestable_atomic_begin(); ns = __sched_clock(); kcsan_nestable_atomic_end(); preempt_enable_notrace(); return ns; } /* * Updating the data required to read the clock. * * sched_clock() will never observe mis-matched data even if called from * an NMI. We do this by maintaining an odd/even copy of the data and * steering sched_clock() to one or the other using a sequence counter. * In order to preserve the data cache profile of sched_clock() as much * as possible the system reverts back to the even copy when the update * completes; the odd copy is used *only* during an update. */ static void update_clock_read_data(struct clock_read_data *rd) { /* steer readers towards the odd copy */ write_seqcount_latch_begin(&cd.seq); /* now its safe for us to update the normal (even) copy */ cd.read_data[0] = *rd; /* switch readers back to the even copy */ write_seqcount_latch(&cd.seq); /* update the backup (odd) copy with the new data */ cd.read_data[1] = *rd; write_seqcount_latch_end(&cd.seq); } /* * Atomically update the sched_clock() epoch. */ static void update_sched_clock(void) { u64 cyc; u64 ns; struct clock_read_data rd; rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); rd.epoch_ns = ns; rd.epoch_cyc = cyc; update_clock_read_data(&rd); } static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { update_sched_clock(); hrtimer_forward_now(hrt, cd.wrap_kt); return HRTIMER_RESTART; } void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; unsigned long r, flags; char r_unit; struct clock_read_data rd; if (cd.rate > rate) return; /* Cannot register a sched_clock with interrupts on */ local_irq_save(flags); /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); cd.rate = rate; /* Calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); rd = cd.read_data[0]; /* Update epoch for new counter and update 'epoch_ns' from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); cd.actual_read_sched_clock = read; rd.read_sched_clock = read; rd.sched_clock_mask = new_mask; rd.mult = new_mult; rd.shift = new_shift; rd.epoch_cyc = new_epoch; rd.epoch_ns = ns; update_clock_read_data(&rd); if (sched_clock_timer.function != NULL) { /* update timeout for clock wrap */ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } r = rate; if (r >= 4000000) { r = DIV_ROUND_CLOSEST(r, 1000000); r_unit = 'M'; } else if (r >= 4000) { r = DIV_ROUND_CLOSEST(r, 1000); r_unit = 'k'; } else { r_unit = ' '; } /* Calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, new_mult, new_shift); pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); /* Enable IRQ time accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); local_irq_restore(flags); pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, * make it the final one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); /* * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } /* * Clock read function for use when the clock is suspended. * * This function makes it appear to sched_clock() as if the clock * stopped counting at its last update. * * This function must only be called from the critical * section in sched_clock(). It relies on the read_seqcount_retry() * at the end of the critical section to be sure we observe the * correct copy of 'epoch_cyc'. */ static u64 notrace suspended_sched_clock_read(void) { unsigned int seq = read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data[0]; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); rd->read_sched_clock = suspended_sched_clock_read; return 0; } void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); rd->read_sched_clock = cd.actual_read_sched_clock; } static struct syscore_ops sched_clock_ops = { .suspend = sched_clock_suspend, .resume = sched_clock_resume, }; static int __init sched_clock_syscore_init(void) { register_syscore_ops(&sched_clock_ops); return 0; } device_initcall(sched_clock_syscore_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, pid) __field(unsigned int, flags) __field(unsigned char, type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->pid = fl ? fl->c.flc_pid : 0; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lease *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->break_time = fl ? fl->fl_break_time : 0; __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->break_time, __entry->downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->c.flc_flags; __entry->l_fl_type = lease->c.flc_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->c.flc_flags; __entry->b_fl_type = breaker->c.flc_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1484 353 22 957 755 708 34 34 820 251 681 799 168 1259 657 275 1506 1496 1506 904 222 177 142 611 991 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/atomic.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2002 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_ATOMIC_LSE_H #define __ASM_ATOMIC_LSE_H #define ATOMIC_OP(op, asm_op) \ static __always_inline void \ __lse_atomic_##op(int i, atomic_t *v) \ { \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op " %w[i], %[v]\n" \ : [v] "+Q" (v->counter) \ : [i] "r" (i)); \ } ATOMIC_OP(andnot, stclr) ATOMIC_OP(or, stset) ATOMIC_OP(xor, steor) ATOMIC_OP(add, stadd) static __always_inline void __lse_atomic_sub(int i, atomic_t *v) { __lse_atomic_add(-i, v); } #undef ATOMIC_OP #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...) \ static __always_inline int \ __lse_atomic_fetch_##op##name(int i, atomic_t *v) \ { \ int old; \ \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op #mb " %w[i], %w[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ : [i] "r" (i) \ : cl); \ \ return old; \ } #define ATOMIC_FETCH_OPS(op, asm_op) \ ATOMIC_FETCH_OP(_relaxed, , op, asm_op) \ ATOMIC_FETCH_OP(_acquire, a, op, asm_op, "memory") \ ATOMIC_FETCH_OP(_release, l, op, asm_op, "memory") \ ATOMIC_FETCH_OP( , al, op, asm_op, "memory") ATOMIC_FETCH_OPS(andnot, ldclr) ATOMIC_FETCH_OPS(or, ldset) ATOMIC_FETCH_OPS(xor, ldeor) ATOMIC_FETCH_OPS(add, ldadd) #undef ATOMIC_FETCH_OP #undef ATOMIC_FETCH_OPS #define ATOMIC_FETCH_OP_SUB(name) \ static __always_inline int \ __lse_atomic_fetch_sub##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_add##name(-i, v); \ } ATOMIC_FETCH_OP_SUB(_relaxed) ATOMIC_FETCH_OP_SUB(_acquire) ATOMIC_FETCH_OP_SUB(_release) ATOMIC_FETCH_OP_SUB( ) #undef ATOMIC_FETCH_OP_SUB #define ATOMIC_OP_ADD_SUB_RETURN(name) \ static __always_inline int \ __lse_atomic_add_return##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_add##name(i, v) + i; \ } \ \ static __always_inline int \ __lse_atomic_sub_return##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_sub(i, v) - i; \ } ATOMIC_OP_ADD_SUB_RETURN(_relaxed) ATOMIC_OP_ADD_SUB_RETURN(_acquire) ATOMIC_OP_ADD_SUB_RETURN(_release) ATOMIC_OP_ADD_SUB_RETURN( ) #undef ATOMIC_OP_ADD_SUB_RETURN static __always_inline void __lse_atomic_and(int i, atomic_t *v) { return __lse_atomic_andnot(~i, v); } #define ATOMIC_FETCH_OP_AND(name, mb, cl...) \ static __always_inline int \ __lse_atomic_fetch_and##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_andnot##name(~i, v); \ } ATOMIC_FETCH_OP_AND(_relaxed, ) ATOMIC_FETCH_OP_AND(_acquire, a, "memory") ATOMIC_FETCH_OP_AND(_release, l, "memory") ATOMIC_FETCH_OP_AND( , al, "memory") #undef ATOMIC_FETCH_OP_AND #define ATOMIC64_OP(op, asm_op) \ static __always_inline void \ __lse_atomic64_##op(s64 i, atomic64_t *v) \ { \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op " %[i], %[v]\n" \ : [v] "+Q" (v->counter) \ : [i] "r" (i)); \ } ATOMIC64_OP(andnot, stclr) ATOMIC64_OP(or, stset) ATOMIC64_OP(xor, steor) ATOMIC64_OP(add, stadd) static __always_inline void __lse_atomic64_sub(s64 i, atomic64_t *v) { __lse_atomic64_add(-i, v); } #undef ATOMIC64_OP #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...) \ static __always_inline long \ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \ { \ s64 old; \ \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op #mb " %[i], %[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ : [i] "r" (i) \ : cl); \ \ return old; \ } #define ATOMIC64_FETCH_OPS(op, asm_op) \ ATOMIC64_FETCH_OP(_relaxed, , op, asm_op) \ ATOMIC64_FETCH_OP(_acquire, a, op, asm_op, "memory") \ ATOMIC64_FETCH_OP(_release, l, op, asm_op, "memory") \ ATOMIC64_FETCH_OP( , al, op, asm_op, "memory") ATOMIC64_FETCH_OPS(andnot, ldclr) ATOMIC64_FETCH_OPS(or, ldset) ATOMIC64_FETCH_OPS(xor, ldeor) ATOMIC64_FETCH_OPS(add, ldadd) #undef ATOMIC64_FETCH_OP #undef ATOMIC64_FETCH_OPS #define ATOMIC64_FETCH_OP_SUB(name) \ static __always_inline long \ __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_add##name(-i, v); \ } ATOMIC64_FETCH_OP_SUB(_relaxed) ATOMIC64_FETCH_OP_SUB(_acquire) ATOMIC64_FETCH_OP_SUB(_release) ATOMIC64_FETCH_OP_SUB( ) #undef ATOMIC64_FETCH_OP_SUB #define ATOMIC64_OP_ADD_SUB_RETURN(name) \ static __always_inline long \ __lse_atomic64_add_return##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_add##name(i, v) + i; \ } \ \ static __always_inline long \ __lse_atomic64_sub_return##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_sub##name(i, v) - i; \ } ATOMIC64_OP_ADD_SUB_RETURN(_relaxed) ATOMIC64_OP_ADD_SUB_RETURN(_acquire) ATOMIC64_OP_ADD_SUB_RETURN(_release) ATOMIC64_OP_ADD_SUB_RETURN( ) #undef ATOMIC64_OP_ADD_SUB_RETURN static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v) { return __lse_atomic64_andnot(~i, v); } #define ATOMIC64_FETCH_OP_AND(name, mb, cl...) \ static __always_inline long \ __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_andnot##name(~i, v); \ } ATOMIC64_FETCH_OP_AND(_relaxed, ) ATOMIC64_FETCH_OP_AND(_acquire, a, "memory") ATOMIC64_FETCH_OP_AND(_release, l, "memory") ATOMIC64_FETCH_OP_AND( , al, "memory") #undef ATOMIC64_FETCH_OP_AND static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v) { unsigned long tmp; asm volatile( __LSE_PREAMBLE "1: ldr %x[tmp], %[v]\n" " subs %[ret], %x[tmp], #1\n" " b.lt 2f\n" " casal %x[tmp], %[ret], %[v]\n" " sub %x[tmp], %x[tmp], #1\n" " sub %x[tmp], %x[tmp], %[ret]\n" " cbnz %x[tmp], 1b\n" "2:" : [ret] "+&r" (v), [v] "+Q" (v->counter), [tmp] "=&r" (tmp) : : "cc", "memory"); return (long)v; } #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...) \ static __always_inline u##sz \ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \ u##sz old, \ u##sz new) \ { \ asm volatile( \ __LSE_PREAMBLE \ " cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \ : [v] "+Q" (*(u##sz *)ptr), \ [old] "+r" (old) \ : [new] "rZ" (new) \ : cl); \ \ return old; \ } __CMPXCHG_CASE(w, b, , 8, ) __CMPXCHG_CASE(w, h, , 16, ) __CMPXCHG_CASE(w, , , 32, ) __CMPXCHG_CASE(x, , , 64, ) __CMPXCHG_CASE(w, b, acq_, 8, a, "memory") __CMPXCHG_CASE(w, h, acq_, 16, a, "memory") __CMPXCHG_CASE(w, , acq_, 32, a, "memory") __CMPXCHG_CASE(x, , acq_, 64, a, "memory") __CMPXCHG_CASE(w, b, rel_, 8, l, "memory") __CMPXCHG_CASE(w, h, rel_, 16, l, "memory") __CMPXCHG_CASE(w, , rel_, 32, l, "memory") __CMPXCHG_CASE(x, , rel_, 64, l, "memory") __CMPXCHG_CASE(w, b, mb_, 8, al, "memory") __CMPXCHG_CASE(w, h, mb_, 16, al, "memory") __CMPXCHG_CASE(w, , mb_, 32, al, "memory") __CMPXCHG_CASE(x, , mb_, 64, al, "memory") #undef __CMPXCHG_CASE #define __CMPXCHG128(name, mb, cl...) \ static __always_inline u128 \ __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ { \ union __u128_halves r, o = { .full = (old) }, \ n = { .full = (new) }; \ register unsigned long x0 asm ("x0") = o.low; \ register unsigned long x1 asm ("x1") = o.high; \ register unsigned long x2 asm ("x2") = n.low; \ register unsigned long x3 asm ("x3") = n.high; \ register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ \ asm volatile( \ __LSE_PREAMBLE \ " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ : [old1] "+&r" (x0), [old2] "+&r" (x1), \ [v] "+Q" (*(u128 *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (o.low), [oldval2] "r" (o.high) \ : cl); \ \ r.low = x0; r.high = x1; \ \ return r.full; \ } __CMPXCHG128( , ) __CMPXCHG128(_mb, al, "memory") #undef __CMPXCHG128 #endif /* __ASM_ATOMIC_LSE_H */ |
318 342 256 291 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/preferred/etc */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif |
255 255 255 255 255 255 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic Timer-queue * * Manages a simple queue of timers, ordered by expiration time. * Uses rbtrees for quick list adds and expiration. * * NOTE: All of the following functions need to be serialized * to avoid races. No locking is done by this library code. */ #include <linux/bug.h> #include <linux/timerqueue.h> #include <linux/rbtree.h> #include <linux/export.h> #define __node_2_tq(_n) \ rb_entry((_n), struct timerqueue_node, node) static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq(a)->expires < __node_2_tq(b)->expires; } /** * timerqueue_add - Adds timer to timerqueue. * * @head: head of timerqueue * @node: timer node to be added * * Adds the timer node to the timerqueue, sorted by the node's expires * value. Returns true if the newly added timer is the first expiring timer in * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. * * @head: head of timerqueue * @node: timer node to be removed * * Removes the timer node from the timerqueue. Returns true if the queue is * not empty after the remove. */ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. * * Provides the timer that is after the given node. This is used, when * necessary, to iterate through the list of timers in a timer list * without modifying the list. */ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) { struct rb_node *next; if (!node) return NULL; next = rb_next(&node->node); if (!next) return NULL; return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); |
1 156 156 18 11 156 1 145 156 552 552 552 328 331 331 302 302 4 4 4 285 301 72 54 188 43 241 32 32 13 231 382 32 276 271 46 5 42 410 229 229 6 271 47 47 156 158 16 16 16 471 29 257 580 229 261 230 179 55 153 285 164 223 53 158 82 471 471 470 467 36 53 7 391 53 169 168 169 7 7 7 7 7 7 53 40 40 40 203 148 3 307 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/rcuwait.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 #if CONFIG_ARCH_PKEY_BITS > 3 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #else # define VM_PKEY_BIT3 0 #endif #if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #endif #if defined(CONFIG_ARM64_GCS) /* * arm64's Guarded Control Stack implements similar functionality and * has similar constraints to shadow stacks. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_6 #endif #ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 38 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif #ifdef CONFIG_64BIT #define VM_DROPPABLE_BIT 40 #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) #elif defined(CONFIG_PPC32) #define VM_DROPPABLE VM_ARCH_1 #else #define VM_DROPPABLE VM_NONE #endif #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_key; lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); #endif if (reset_refcnt) refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } static inline bool is_vma_writer_only(int refcnt) { /* * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) { /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ struct mm_struct *mm = vma->vm_mm; int oldcnt; rwsem_release(&vma->vmlock_dep_map, _RET_IP_); if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { if (is_vma_writer_only(oldcnt - 1)) rcuwait_wake_up(&mm->vma_writer_wait); } } /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. * False locked result is possible if mm_lock_seq overflows or if vma gets * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { int oldcnt; /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). */ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VMA_REF_LIMIT))) { /* return EAGAIN if vma got detached from under us */ return oldcnt ? NULL : ERR_PTR(-EAGAIN); } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } return vma; } /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { int oldcnt; mmap_assert_locked(vma->vm_mm); if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VMA_REF_LIMIT))) return false; rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { vma_refcount_put(vma); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && !__is_vma_write_locked(vma, &mm_lock_seq), vma); } /* * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these * assertions should be made either under mmap_write_lock or when the object * has been isolated under mmap_write_lock, ensuring no competing writers. */ static inline void vma_assert_attached(struct vm_area_struct *vma) { WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} static inline void vma_assert_detached(struct vm_area_struct *vma) {} static inline void vma_mark_attached(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void vma_assert_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); } static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. */ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. */ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; extern void prep_compound_page(struct page *page, unsigned int order); static inline unsigned int folio_large_order(const struct folio *folio) { return folio->_flags_1 & 0xff; } #ifdef NR_PAGES_IN_LARGE_FOLIO static inline long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else static inline long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } #endif /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio_large_order(folio); } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio_large_order(folio); } /** * folio_reset_order - Reset the folio order and derived _nr_pages * @folio: The folio. * * Reset the order and derived _nr_pages to 0. Must only be used in the * process of splitting large folios. */ static inline void folio_reset_order(struct folio *folio) { if (WARN_ON_ONCE(!folio_test_large(folio))) return; folio->_flags_1 &= ~0xffUL; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) return 0; return atomic_read(&folio->_entire_mapcount) + 1; } static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly on folio reference. * * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { int mapcount; if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } return folio_large_mapcount(folio); } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(const struct folio *folio) { return folio_mapcount(folio) >= 1; } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(const struct page *page) { return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A page may belong to an inode's memory mapping. In this case, page->mapping * is the pointer to the inode, and page->index is the file offset of the page, * in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void get_page(struct page *page) { struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_test_slab(folio))) return; folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. */ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need to * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folios_put(struct folio_batch *folios) { folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); if (folio_test_slab(folio)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int folio_nid(const struct folio *folio) { return page_to_nid(&folio->page); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits needs to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) { return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int folio_last_cpupid(struct folio *folio) { return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; last_time = folio_xchg_last_cpupid(folio, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } static inline int folio_last_cpupid(struct folio *folio) { return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } static inline bool folio_use_access_time(struct folio *folio) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { unsigned long old_flags, flags; if (!kasan_enabled()) return; tag ^= 0xff; old_flags = READ_ONCE(page->flags); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } static inline struct zone *folio_zone(const struct folio *folio) { return page_zone(&folio->page); } static inline pg_data_t *folio_pgdat(const struct folio *folio) { return page_pgdat(&folio->page); } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif /** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio. */ static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } static inline struct folio *pfn_folio(unsigned long pfn) { return page_folio(pfn_to_page(pfn)); } static inline bool folio_has_pincount(const struct folio *folio) { if (IS_ENABLED(CONFIG_64BIT)) return folio_test_large(folio); return folio_order(folio) > 1; } /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. * * For small folios, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then * folio_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. * * Here, for that overflow case, use the sign bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)folio_ref_count(folio)) >= GUP_PIN_COUNTING_BIAS; } /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; return folio_maybe_dma_pinned(folio); } /** * is_zero_page - Query if a page is a zero page * @page: The page to query * * This returns true if @page is one of the permanent zero pages. */ static inline bool is_zero_page(const struct page *page) { return is_zero_pfn(page_to_pfn(page)); } /** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages. */ static inline bool is_zero_folio(const struct folio *folio) { return is_zero_page(&folio->page); } /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) return false; /* * Filesystems can only tolerate transient delays to truncate and * hole-punch operations */ if (folio_is_fsdax(folio)) return false; /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); } #else static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } /** * folio_nr_pages - The number of pages in the folio. * @folio: The folio. * * Return: A positive power of two. */ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; return folio_large_nr_pages(folio); } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) #else #define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES #endif /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; return folio_large_nr_pages(folio); } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline long thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next. Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio. */ static inline struct folio *folio_next(struct folio *folio) { return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } /** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } /** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** * folio_maybe_mapped_shared - Whether the folio is mapped into the page * tables of more than one MM * @folio: The folio. * * This function checks if the folio maybe currently mapped into more than one * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly * indicate "mapped shared" (false positive) if a folio was mapped by * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). * * This function does not consider: * #. If the folio might get mapped in the (near) future (e.g., swapcache, * pagecache, temporary unmapping for migration). * #. If the folio is mapped differently (VM_PFNMAP). * #. If hugetlb page table sharing applies. Callers might want to check * hugetlb_pmd_shared(). * * Return: Whether the folio is estimated to be mapped into more than one MM. */ static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); /* Only partially-mappable folios require more care. */ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; /* * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... * simply assume "mapped shared", nobody should really care * about this for arbitrary kernel allocations. */ if (!IS_ENABLED(CONFIG_MM_ID)) return true; /* * A single mapping implies "mapped exclusively", even if the * folio flag says something different: it's easier to handle this * case here instead of on the RMAP hot path. */ if (mapcount <= 1) return false; return folio_test_large_maybe_mapped_shared(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { return 0; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif static inline void *folio_address(const struct folio *folio) { return page_address(&folio->page); } /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(const struct page *page) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)page->lru.next & BIT(1); } /* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)folio->lru.next & BIT(1); } /* * Only to be called by the page allocator on a freshly allocated * page. */ static inline void set_page_pfmemalloc(struct page *page) { page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { page->lru.next = NULL; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely * drop the page in the mm, either by truncation or unmapping of the vma. By * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return raw_smp_processor_id(); } #endif #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); static inline void zap_vma_pages(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); struct follow_pfnmap_args { /** * Inputs: * @vma: Pointer to @vm_area_struct struct * @address: the virtual address to walk */ struct vm_area_struct *vma; unsigned long address; /** * Internals: * * The caller shouldn't touch any of these. */ spinlock_t *lock; pte_t *ptep; /** * Outputs: * * @pfn: the PFN of the address * @addr_mask: address mask covering pfn * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; unsigned long addr_mask; pgprot_t pgprot; bool writable; bool special; }; int follow_pfnmap_start(struct follow_pfnmap_args *args); void follow_pfnmap_end(struct follow_pfnmap_args *args); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_folio(struct address_space *mapping, struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); #ifdef CONFIG_BPF_SYSCALL extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); #endif long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); /* * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. */ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, struct vm_area_struct **vmap) { struct page *page; struct vm_area_struct *vma; int got; if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) return ERR_PTR(-EINVAL); got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { put_page(page); return ERR_PTR(-EINVAL); } *vmap = vma; return page; } long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However * for now all the callers are only use one of the flags at the same * time. */ /* * Whether we should manually check if we can map individual PTEs writable, * because something (e.g., COW, uffd-wp) blocks that from happening for all * PTEs automatically in a writable mapping. */ #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; } /* * per-process(per-mm_struct) statistics. */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { return percpu_counter_read_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { percpu_counter_dec(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } /* Optimized variant when folio is already known not to be anon */ static inline int mm_counter_file(struct folio *folio) { if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); } static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); } static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); if ((mm)->hiwater_rss < _rss) (mm)->hiwater_rss = _rss; } static inline void update_hiwater_vm(struct mm_struct *mm) { if (mm->hiwater_vm < mm->total_vm) mm->hiwater_vm = mm->total_vm; } static inline void reset_mm_hiwater_rss(struct mm_struct *mm) { mm->hiwater_rss = get_mm_rss(mm); } static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { unsigned long hiwater_rss = get_mm_hiwater_rss(mm); if (*maxrss < hiwater_rss) *maxrss = hiwater_rss; } #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #endif #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return false; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd; } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return false; } static inline pud_t pud_mkspecial(pud_t pud) { return pud; } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; } #endif extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pte_t *ptep; __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); return ptep; } #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { atomic_long_set(&mm->pgtables_bytes, 0); } static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? NULL : p4d_offset(pgd, address); } static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU */ static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } static inline void *ptdesc_to_virt(const struct ptdesc *pt) { return page_to_virt(ptdesc_page(pt)); } static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); } static inline bool pagetable_is_reserved(struct ptdesc *pt) { return folio_test_reserved(ptdesc_folio(pt)); } /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order * * pagetable_alloc allocates memory for page tables as well as a page table * descriptor to describe that memory. * * Return: The ptdesc describing the allocated page tables. */ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) /** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself. */ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); __free_pages(page, compound_order(page)); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) { } static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); return ptlock_ptr(virt_to_ptdesc(pte)); } static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) return false; spin_lock_init(ptlock_ptr(ptdesc)); return true; } #else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) { pagetable_dtor(ptdesc); pagetable_free(ptdesc); } static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { if (!ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; } pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { pte_t *pte; __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); return pte; } static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pte_t *pte; __cond_lock(RCU, __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { return page_ptdesc(pmd_pgtable_page(pmd)); } static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif return ptlock_init(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) #endif static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; } static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { if (!pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); return true; } /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock. */ static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) { return &mm->page_table_lock; } static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) { spinlock_t *ptl = pud_lockptr(mm, pud); spin_lock(ptl); return ptl; } static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); static inline void mark_page_reserved(struct page *page) { SetPageReserved(page); adjust_managed_page_count(page, -1); } static inline void free_reserved_ptdesc(struct ptdesc *pt) { free_reserved_page(ptdesc_page(pt)); } /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) { int nid; unsigned long phys_pages = 0; for_each_online_node(nid) phys_pages += node_present_pages(nid); return phys_pages; } /* * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void mem_init(void); extern void __init mmap_init(void |